From 0bc784982f183d9d50be31adb867e84c237d9fc3 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Mon, 18 Mar 2019 20:07:37 +0000 Subject: COMPMID-1958: Implements 1D FFT in OpenCL. Forward complex FFT implementation. Change-Id: Ia0ba8740072e5adb06f8ead462a47abc8b5dd125 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/904 Reviewed-by: Gian Marco Iodice Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins --- arm_compute/core/CL/CLKernels.h | 2 + .../core/CL/kernels/CLFFTDigitReverseKernel.h | 78 ++ .../core/CL/kernels/CLFFTRadixStageKernel.h | 87 ++ arm_compute/core/KernelDescriptors.h | 38 + arm_compute/core/utils/helpers/fft.h | 55 ++ arm_compute/runtime/CL/CLFunctions.h | 1 + arm_compute/runtime/CL/functions/CLFFT1D.h | 79 ++ arm_compute/runtime/FunctionDescriptors.h | 35 + src/core/CL/CLKernelLibrary.cpp | 25 +- src/core/CL/cl_kernels/fft.cl | 1014 ++++++++++++++++++++ src/core/CL/kernels/CLFFTDigitReverseKernel.cpp | 124 +++ src/core/CL/kernels/CLFFTRadixStageKernel.cpp | 163 ++++ src/core/utils/helpers/fft.cpp | 124 +++ src/runtime/CL/functions/CLFFT1D.cpp | 119 +++ tests/benchmark/CL/FFT.cpp | 55 ++ tests/benchmark/fixtures/FFTFixture.h | 83 ++ tests/validation/CL/FFT.cpp | 125 +++ tests/validation/fixtures/FFTFixture.h | 110 +++ 18 files changed, 2313 insertions(+), 4 deletions(-) create mode 100644 arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h create mode 100644 arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h create mode 100644 arm_compute/core/KernelDescriptors.h create mode 100644 arm_compute/core/utils/helpers/fft.h create mode 100644 arm_compute/runtime/CL/functions/CLFFT1D.h create mode 100644 arm_compute/runtime/FunctionDescriptors.h create mode 100644 src/core/CL/cl_kernels/fft.cl create mode 100644 src/core/CL/kernels/CLFFTDigitReverseKernel.cpp create mode 100644 src/core/CL/kernels/CLFFTRadixStageKernel.cpp create mode 100644 src/core/utils/helpers/fft.cpp create mode 100644 src/runtime/CL/functions/CLFFT1D.cpp create mode 100644 tests/benchmark/CL/FFT.cpp create mode 100644 tests/benchmark/fixtures/FFTFixture.h create mode 100644 tests/validation/CL/FFT.cpp create mode 100644 tests/validation/fixtures/FFTFixture.h diff --git a/arm_compute/core/CL/CLKernels.h b/arm_compute/core/CL/CLKernels.h index 2fd2341e48..b767812fc8 100644 --- a/arm_compute/core/CL/CLKernels.h +++ b/arm_compute/core/CL/CLKernels.h @@ -64,6 +64,8 @@ #include "arm_compute/core/CL/kernels/CLElementWiseUnaryLayerKernel.h" #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLErodeKernel.h" +#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h" +#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h" #include "arm_compute/core/CL/kernels/CLFastCornersKernel.h" #include "arm_compute/core/CL/kernels/CLFillBorderKernel.h" #include "arm_compute/core/CL/kernels/CLFlattenLayerKernel.h" diff --git a/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h b/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h new file mode 100644 index 0000000000..10652cdb4d --- /dev/null +++ b/arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2019 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H__ +#define __ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H__ + +#include "arm_compute/core/CL/ICLKernel.h" + +namespace arm_compute +{ +// Forward declarations +class ICLTensor; + +/** Interface for the digit reverse operation kernel. */ +class CLFFTDigitReverseKernel : public ICLKernel +{ +public: + /** Constructor */ + CLFFTDigitReverseKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFFTDigitReverseKernel(const CLFFTDigitReverseKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + CLFFTDigitReverseKernel &operator=(const CLFFTDigitReverseKernel &) = delete; + /** Default Move Constructor. */ + CLFFTDigitReverseKernel(CLFFTDigitReverseKernel &&) = default; + /** Default move assignment operator */ + CLFFTDigitReverseKernel &operator=(CLFFTDigitReverseKernel &&) = default; + /** Default destructor */ + ~CLFFTDigitReverseKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: F32. + * @param[out] output Destination tensor. Data type supported: same as @p input + * @param[in] idx Digit reverse index tensor. Data type supported: U32 + * @param[in] axis Axis to perform digit reverse on. + */ + void configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, unsigned int axis); + /** Static function to check if given info will lead to a valid configuration of @ref CLFFTDigitReverseKernel + * + * @param[in] input Source tensor info. Data types supported: F32. + * @param[in] output Destination tensor info. Data type supported: same as @p input + * @param[in] idx Digit reverse index tensor info. Data type supported: U32 + * @param[in] axis Axis to perform digit reverse on. 
+ *
+ * @return a status
+ */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis);
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    const ICLTensor *_input;
+    ICLTensor       *_output;
+    const ICLTensor *_idx;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLFFTDIGITREVERSEKERNEL_H__ */
diff --git a/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h b/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
new file mode 100644
index 0000000000..9de775eafa
--- /dev/null
+++ b/arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H__
+#define __ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H__
+
+#include "arm_compute/core/CL/ICLKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+#include <set>
+
+namespace arm_compute
+{
+// Forward declarations
+class ICLTensor;
+
+/** Interface for the FFT radix stage kernel. */
+class CLFFTRadixStageKernel : public ICLKernel
+{
+public:
+    /** Constructor */
+    CLFFTRadixStageKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTRadixStageKernel(const CLFFTRadixStageKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    CLFFTRadixStageKernel &operator=(const CLFFTRadixStageKernel &) = delete;
+    /** Default Move Constructor. */
+    CLFFTRadixStageKernel(CLFFTRadixStageKernel &&) = default;
+    /** Default move assignment operator */
+    CLFFTRadixStageKernel &operator=(CLFFTRadixStageKernel &&) = default;
+    /** Default destructor */
+    ~CLFFTRadixStageKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @note If the output tensor is nullptr, the FFT will be performed in-place
+     *
+     * @param[in,out] input  Source tensor. Data types supported: F32.
+     * @param[out]    output Destination tensor. Can be nullptr. Data type supported: same as @p input
+     * @param[in]     config FFT descriptor metadata.
+     */
+    void configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelDescriptor &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFTRadixStageKernel
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Can be nullptr. Data type supported: same as @p input
+     * @param[in] config FFT descriptor metadata.
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelDescriptor &config);
+    /** Returns the radices that are supported by the FFT kernel
+     *
+     * @return A set of supported radices
+     */
+    static std::set<unsigned int> supported_radix();
+
+    // Inherited methods overridden:
+    void run(const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    ICLTensor *_input;
+    ICLTensor *_output;
+    bool       _run_in_place;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLFFTRADIXSTAGEKERNEL_H__ */
diff --git a/arm_compute/core/KernelDescriptors.h b/arm_compute/core/KernelDescriptors.h
new file mode 100644
index 0000000000..186dbfb6d8
--- /dev/null
+++ b/arm_compute/core/KernelDescriptors.h
@@ -0,0 +1,38 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H__
+#define __ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H__
+
+namespace arm_compute
+{
+/** Descriptor used by the FFT core kernels */
+struct FFTRadixStageKernelDescriptor
+{
+    unsigned int axis{ 0 };               /**< Axis to run the FFT on. */
+    unsigned int radix{ 0 };              /**< Radix to use. */
+    unsigned int Nx{ 0 };                 /**< Nx coefficient. */
+    bool         is_first_stage{ false }; /**< Flags if the FFT kernel is the first stage of a decomposed FFT. */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_CORE_KERNEL_DESCRIPTORS_H__ */
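For orientation, here is a sketch of how these fields compose across a decomposed FFT. The factorization {8, 3} for N = 24 and its ordering are assumptions for illustration only; the decomposition actually used is produced by the helpers added below, and Nx accumulates the product of the radices of the stages already executed:

// Hypothetical stage descriptors for N = 24 = 8 * 3 (illustrative values)
FFTRadixStageKernelDescriptor stage0{};
stage0.axis           = 0;
stage0.radix          = 8;
stage0.Nx             = 1;     // No previous stages yet: butterfly span of 1
stage0.is_first_stage = true;  // Consumes the digit-reversed input directly

FFTRadixStageKernelDescriptor stage1{};
stage1.axis           = 0;
stage1.radix          = 3;
stage1.Nx             = 8;     // Product of the radices of the previous stages
stage1.is_first_stage = false; // Applies twiddle factors before its butterflies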
diff --git a/arm_compute/core/utils/helpers/fft.h b/arm_compute/core/utils/helpers/fft.h
new file mode 100644
index 0000000000..bd84a5c63d
--- /dev/null
+++ b/arm_compute/core/utils/helpers/fft.h
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_UTILS_HELPERS_FFT_H__
+#define __ARM_COMPUTE_UTILS_HELPERS_FFT_H__
+
+#include <set>
+#include <vector>
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace fft
+{
+/** Decompose a given 1D input size using the provided supported factors.
+ *
+ * @param[in] N                 Input size to be decomposed.
+ * @param[in] supported_factors Supported factors that can be used for decomposition.
+ *
+ * @return A vector with the stages of the decomposition. Will be empty if decomposition failed.
+ */
+std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors);
+/** Calculate digit reverse index vector given fft size and the decomposed stages
+ *
+ * @param[in] N          Input size to calculate digit reverse for
+ * @param[in] fft_stages A vector with the FFT decomposed stages
+ *
+ * @return A vector with the digit reverse indices. Will be empty if it failed.
+ */
+std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages);
+} // namespace fft
+} // namespace helpers
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_UTILS_HELPERS_FFT_H__ */
diff --git a/arm_compute/runtime/CL/CLFunctions.h b/arm_compute/runtime/CL/CLFunctions.h
index 42897a6e23..46e43dc0a9 100644
--- a/arm_compute/runtime/CL/CLFunctions.h
+++ b/arm_compute/runtime/CL/CLFunctions.h
@@ -65,6 +65,7 @@
 #include "arm_compute/runtime/CL/functions/CLElementwiseOperations.h"
 #include "arm_compute/runtime/CL/functions/CLEqualizeHistogram.h"
 #include "arm_compute/runtime/CL/functions/CLErode.h"
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
 #include "arm_compute/runtime/CL/functions/CLFastCorners.h"
 #include "arm_compute/runtime/CL/functions/CLFillBorder.h"
 #include "arm_compute/runtime/CL/functions/CLFlattenLayer.h"
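As a rough model of the permutation digit_reverse_indices describes, the index of each element is rewritten by reversing its mixed-radix digits over the decomposed stages. The following reference sketch is illustrative only, not the library implementation, and assumes stage i consumes the i-th least-significant digit:

#include <set>
#include <vector>

// Reference sketch of a mixed-radix digit reversal (illustrative convention).
std::vector<unsigned int> digit_reverse_indices_ref(unsigned int N, const std::vector<unsigned int> &stages)
{
    std::vector<unsigned int> indices(N);
    for(unsigned int n = 0; n < N; ++n)
    {
        unsigned int reversed  = 0;
        unsigned int remainder = n;
        for(unsigned int radix : stages)
        {
            // Peel the least-significant digit off n and append it as the
            // most-significant digit of the reversed index.
            reversed   = reversed * radix + (remainder % radix);
            remainder /= radix;
        }
        indices[n] = reversed;
    }
    return indices;
}

For N = 8 with stages {2, 2, 2} this reduces to the classic bit reversal {0, 4, 2, 6, 1, 5, 3, 7}.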
diff --git a/arm_compute/runtime/CL/functions/CLFFT1D.h b/arm_compute/runtime/CL/functions/CLFFT1D.h
new file mode 100644
index 0000000000..1612cf7f50
--- /dev/null
+++ b/arm_compute/runtime/CL/functions/CLFFT1D.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_CLFFT1D_H__
+#define __ARM_COMPUTE_CLFFT1D_H__
+
+#include "arm_compute/runtime/IFunction.h"
+
+#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
+#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+#include "arm_compute/runtime/CL/CLMemoryGroup.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+
+namespace arm_compute
+{
+// Forward declaration
+class ICLTensor;
+
+/** Basic function to execute one dimensional FFT. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFFTDigitReverseKernel Performs digit reverse
+ * -# @ref CLFFTRadixStageKernel   A list of FFT kernels depending on the radix decomposition
+ */
+class CLFFT1D : public IFunction
+{
+public:
+    /** Default Constructor */
+    CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager = nullptr);
+    /** Initialise the function's source and destination.
+     *
+     * @param[in]  input  Source tensor. Data types supported: F32.
+     * @param[out] output Destination tensor. Data types and data layouts supported: Same as @p input.
+     * @param[in]  config FFT related configuration
+     */
+    void configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config);
+    /** Static function to check if given info will lead to a valid configuration of @ref CLFFT1D.
+     *
+     * @param[in] input  Source tensor info. Data types supported: F32.
+     * @param[in] output Destination tensor info. Data types and data layouts supported: Same as @p input.
+     * @param[in] config FFT related configuration
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config);
+
+    // Inherited methods overridden:
+    void run() override;
+
+protected:
+    CLMemoryGroup                            _memory_group;
+    CLTensor                                 _digit_reversed_input;
+    CLTensor                                 _digit_reverse_indices;
+    CLFFTDigitReverseKernel                  _digit_reverse_kernel;
+    std::unique_ptr<CLFFTRadixStageKernel[]> _fft_kernels;
+    unsigned int                             _num_ffts;
+};
+} // namespace arm_compute
+#endif /*__ARM_COMPUTE_CLFFT1D_H__ */
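A minimal usage sketch of the new function. The example assumes the complex signal is packed as a 2-channel F32 tensor with interleaved real/imaginary parts, matching the float2 loads in fft.cl below; the exact shape and channel requirements are enforced by CLFFT1D::validate:

#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLFFT1D.h"

using namespace arm_compute;

void run_fft1d_example()
{
    CLScheduler::get().default_init();

    // 640 = 5 * 2^7 decomposes into the supported radices.
    const unsigned int N = 640;
    CLTensor src{}, dst{};
    src.allocator()->init(TensorInfo(TensorShape(N), 2 /* interleaved re/im */, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(N), 2, DataType::F32));

    FFT1DInfo fft_info{};
    fft_info.axis = 0;

    CLFFT1D fft{};
    fft.configure(&src, &dst, fft_info);

    src.allocator()->allocate();
    dst.allocator()->allocate();
    // ... map src and fill it with the input signal ...

    fft.run();
    CLScheduler::get().sync();
}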
diff --git a/arm_compute/runtime/FunctionDescriptors.h b/arm_compute/runtime/FunctionDescriptors.h
new file mode 100644
index 0000000000..7ff25019e6
--- /dev/null
+++ b/arm_compute/runtime/FunctionDescriptors.h
@@ -0,0 +1,35 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H__
+#define __ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H__
+
+namespace arm_compute
+{
+/** Descriptor used by the FFT1d function */
+struct FFT1DInfo
+{
+    unsigned int axis{ 0 }; /**< Axis to run the FFT on. */
+};
+} // namespace arm_compute
+#endif /* __ARM_COMPUTE_RUNTIME_FUNCTION_DESCRIPTORS_H__ */
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 0c895ce5c6..818039c184 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -219,6 +219,7 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "depthwise_convolution_3x3_f16", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_nhwc", "depthwise_convolution.cl" },
     { "depthwise_convolution_3x3_nhwc_stride1", "depthwise_convolution.cl" },
+    { "digit_reverse", "fft.cl" },
     { "dwc_3x3_native_qasymm8_nchw", "depthwise_convolution_quantized.cl" },
     { "dwc_3x3_native_qasymm8_dot8_nchw", "depthwise_convolution_quantized.cl" },
     { "dwc_3x3_reshaped_qasymm8_nhwc", "depthwise_convolution_quantized.cl" },
@@ -260,12 +261,24 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
     { "elementwise_unary", "elementwise_unary.cl" },
     { "erode", "erode.cl" },
     { "fast_corners", "fast_corners.cl" },
-    { "flatten", "flatten.cl" },
+    { "fft_radix_2_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_2_axis_0", "fft.cl" },
+    { "fft_radix_3_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_3_axis_0", "fft.cl" },
+    { "fft_radix_4_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_4_axis_0", "fft.cl" },
+    { "fft_radix_5_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_5_axis_0", "fft.cl" },
+    { "fft_radix_7_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_7_axis_0", "fft.cl" },
+    { "fft_radix_8_first_stage_axis_0", "fft.cl" },
+    { "fft_radix_8_axis_0", "fft.cl" },
     { "fill_image_borders_constant", "fill_border.cl" },
     { "fill_image_borders_replicate", "fill_border.cl" },
     { "finalize", "optical_flow_pyramid_lk.cl" },
-    { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
+    { "flatten", "flatten.cl" },
     { "floor_layer", "floor.cl" },
+    { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
     { "gather", "gather.cl" },
     { "gaussian1x5_sub_x", "gaussian_pyramid.cl" },
     { "gaussian5x1_sub_y", "gaussian_pyramid.cl" },
@@ -686,12 +699,16 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
 #include
"./cl_kernels/fast_corners.clembed" }, { - "flatten.cl", -#include "./cl_kernels/flatten.clembed" + "fft.cl", +#include "./cl_kernels/fft.clembed" }, { "fill_border.cl", #include "./cl_kernels/fill_border.clembed" + }, + { + "flatten.cl", +#include "./cl_kernels/flatten.clembed" }, { "floor.cl", diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/fft.cl new file mode 100644 index 0000000000..5f1ef2483b --- /dev/null +++ b/src/core/CL/cl_kernels/fft.cl @@ -0,0 +1,1014 @@ +/* + * Copyright (c) 2019 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +/** Computes the digit reverse stage + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] idx_ptr Pointer to the index tensor. 
Supported data types: U32 + * @param[in] idx_stride_x Stride of the index tensor in X dimension (in bytes) + * @param[in] idx_step_x idx_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] idx_offset_first_element_in_bytes The offset of the first element in the index tensor + */ +__kernel void digit_reverse( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(idx)) +{ + // Get tensor pointers + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector idx = CONVERT_TO_VECTOR_STRUCT(idx); + + const unsigned int iidx = *((__global uint *)(idx.ptr)); + + // Load data + float2 data = vload2(0, (__global float *)tensor3D_offset(&src, iidx, get_global_id(1), get_global_id(2))); + + // Store result + vstore2(data, 0, (__global float *)dst.ptr); +} + +/** Calculates and applies the twiddle factor to a given input. + * + * @param[in] phi The angle. + * @param[in,out] input The input on which the factor should be applied. + */ +#define TWIDDLE_FACTOR_MULTIPLICATION(phi, input) \ + { \ + float2 w, tmp; \ + w.x = native_cos(phi); \ + w.y = native_sin(phi); \ + tmp.x = (w.x * input.x) - (w.y * input.y); \ + tmp.y = (w.x * input.y) + (w.y * input.x); \ + input = tmp; \ + } + +/** Computes radix-2 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + */ +#define DFT_2(c0, c1) \ + { \ + float2 v0; \ + v0 = c0; \ + c0 = v0 + c1; \ + c1 = v0 - c1; \ + } + +// radix-3 butterfly unit factors +#define SQRT3DIV2 0.86602540378443f + +/** Computes radix-3 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + */ +#define DFT_3(c0, c1, c2) \ + { \ + float2 v0 = c1 + c2; \ + float2 v1 = c1 - c2; \ + c1.x = c0.x - 0.5f * v0.x + v1.y * SQRT3DIV2; \ + c1.y = c0.y - 0.5f * v0.y - v1.x * SQRT3DIV2; \ + c2.x = c0.x - 0.5f * v0.x - v1.y * SQRT3DIV2; \ + c2.y = c0.y - 0.5f * v0.y + v1.x * SQRT3DIV2; \ + c0 = c0 + v0; \ + } + +/**Computes radix-4 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + */ +#define DFT_4(c0, c1, c2, c3) \ + { \ + float2 v0, v1, v2, v3; \ + v0 = c0 + c2; \ + v1 = c1 + c3; \ + v2 = c0 - c2; \ + v3.x = c1.y - c3.y; \ + v3.y = c3.x - c1.x; \ + c0 = v0 + v1; \ + c2 = v0 - v1; \ + c1 = v2 + v3; \ + c3 = v2 - v3; \ + } + +// radix-5 butterfly unit factors +#define W5_A 0.30901699437494f +#define W5_B 0.95105651629515f +#define W5_C 0.80901699437494f +#define W5_D 0.58778525229247f + +/** Computes radix-5 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + * @param[in,out] c4 Complex input 4. 
+ */ +#define DFT_5(c0, c1, c2, c3, c4) \ + { \ + float2 v0, v1, v2, v3, v4; \ + v0 = c0; \ + v1 = W5_A * (c1 + c4) - W5_C * (c2 + c3); \ + v2 = W5_C * (c1 + c4) - W5_A * (c2 + c3); \ + v3 = W5_D * (c1 - c4) - W5_B * (c2 - c3); \ + v4 = W5_B * (c1 - c4) + W5_D * (c2 - c3); \ + c0 = v0 + c1 + c2 + c3 + c4; \ + c1 = v0 + v1 + (float2)(v4.y, -v4.x); \ + c2 = v0 - v2 + (float2)(v3.y, -v3.x); \ + c3 = v0 - v2 + (float2)(-v3.y, v3.x); \ + c4 = v0 + v1 + (float2)(-v4.y, v4.x); \ + } + +// radix-7 butterfly unit factors +#define W7_A 0.62348980185873f +#define W7_B 0.78183148246802f +#define W7_C 0.22252093395631f +#define W7_D 0.97492791218182f +#define W7_E 0.90096886790241f +#define W7_F 0.43388373911755f + +/** Computes radix-7 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + * @param[in,out] c4 Complex input 4. + * @param[in,out] c5 Complex input 5. + * @param[in,out] c6 Complex input 6. + */ +#define DFT_7(c0, c1, c2, c3, c4, c5, c6) \ + { \ + float2 v0, v1, v2, v3, v4, v5, v6; \ + v0 = c0; \ + v1 = W7_A * (c1 + c6) - W7_C * (c2 + c5) - W7_E * (c3 + c4); \ + v2 = W7_C * (c1 + c6) + W7_E * (c2 + c5) - W7_A * (c3 + c4); \ + v3 = W7_E * (c1 + c6) - W7_A * (c2 + c5) + W7_C * (c3 + c4); \ + v4 = W7_B * (c1 - c6) + W7_D * (c2 - c5) + W7_F * (c3 - c4); \ + v5 = W7_D * (c1 - c6) - W7_F * (c2 - c5) - W7_B * (c3 - c4); \ + v6 = W7_F * (c1 - c6) - W7_B * (c2 - c5) + W7_D * (c3 - c4); \ + c0 = v0 + c1 + c2 + c3 + c4 + c5 + c6; \ + c1 = v0 + v1 + (float2)(v4.y, -v4.x); \ + c2 = v0 - v2 + (float2)(v5.y, -v5.x); \ + c3 = v0 - v3 + (float2)(v6.y, -v6.x); \ + c4 = v0 - v3 + (float2)(-v6.y, v6.x); \ + c5 = v0 - v2 + (float2)(-v5.y, v5.x); \ + c6 = v0 + v1 + (float2)(-v4.y, v4.x); \ + } + +/** Computes radix-8 butterfly unit. + * + * @param[in,out] c0 Complex input 0. + * @param[in,out] c1 Complex input 1. + * @param[in,out] c2 Complex input 2. + * @param[in,out] c3 Complex input 3. + * @param[in,out] c4 Complex input 4. + * @param[in,out] c5 Complex input 5. + * @param[in,out] c6 Complex input 6. + * @param[in,out] c7 Complex input 7. + */ +#define DFT_8(c0, c1, c2, c3, c4, c5, c6, c7) \ + { \ + float2 v0, v1, v2, v3, v4, v5, v6, v7; \ + float2 s0, s1, s2, s3, s4, s5, s6, s7; \ + float2 t0, t1, t2; \ + v0 = c0 + c4; \ + v1 = c1 + c5; \ + v2 = c2 + c6; \ + v3 = c3 + c7; \ + v4 = c0 - c4; \ + v5 = c1 - c5; \ + v6 = c2 - c6; \ + v7 = c3 - c7; \ + s0 = v0 + v2; \ + s1 = v1 + v3; \ + s2 = v0 - v2; \ + s3 = v1 - v3; \ + s4.x = v4.x - v6.y; \ + s4.y = v4.y + v6.x; \ + s5.x = v5.x - v7.y; \ + s5.y = v5.y + v7.x; \ + s6.x = v4.x + v6.y; \ + s6.y = v4.y - v6.x; \ + s7.x = v5.x + v7.y; \ + s7.y = v5.y - v7.x; \ + t0.x = -s3.y; \ + t0.y = s3.x; \ + t1.x = M_SQRT1_2_F * (s5.x - s5.y); \ + t1.y = M_SQRT1_2_F * (s5.x + s5.y); \ + t2.x = -M_SQRT1_2_F * (s7.x + s7.y); \ + t2.y = M_SQRT1_2_F * (s7.x - s7.y); \ + c0 = s0 + s1; \ + c1 = s6 - t2; \ + c2 = s2 - t0; \ + c3 = s4 - t1; \ + c4 = s0 - s1; \ + c5 = s6 + t2; \ + c6 = s2 + t0; \ + c7 = s4 + t1; \ + } + +/** Computes the first stage of a radix-2 DFT. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. 
Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+kernel void fft_radix_2_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load two complex input values
+    float4 data = vload4(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 2
+    DFT_2(data.s01, data.s23);
+
+    // Store two complex output values
+    vstore4(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-3 DFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+kernel void fft_radix_3_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float4 data0 = vload4(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 2, 0, 0));
+
+    // Compute DFT N = 3
+    DFT_3(data0.s01, data0.s23, data1.s01);
+
+    // Store three complex output values
+    vstore4(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 2, 0, 0));
+}
+
+/** Computes the first stage of a radix-4 DFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+kernel void fft_radix_4_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float8 data = vload8(0, (__global float *)input.ptr);
+
+    // Compute DFT N = 4
+    DFT_4(data.s01, data.s23, data.s45, data.s67);
+
+    // Store four complex output values
+    vstore8(data, 0, (__global float *)output.ptr);
+}
+
+/** Computes the first stage of a radix-5 DFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+kernel void fft_radix_5_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float8 data0 = vload8(0, (__global float *)input.ptr);
+    float2 data1 = vload2(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+
+    // Compute DFT N = 5
+    DFT_5(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01);
+
+    // Store five complex output values
+    vstore8(data0, 0, (__global float *)output.ptr);
+    vstore2(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+}
+
+/** Computes the first stage of a radix-7 DFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                      Stride of the destination image in X dimension (in bytes)
+ * @param[in]     output_step_x                        output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in]     output_stride_y                      Stride of the destination image in Y dimension (in bytes)
+ * @param[in]     output_step_y                        output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in]     output_stride_z                      Stride of the source tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                        output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in]     output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+kernel void fft_radix_7_first_stage_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+)
+{
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float8 data0 = vload8(0, (__global float *)input.ptr);
+    float4 data1 = vload4(0, (__global float *)tensor3D_offset(&input, 4, 0, 0));
+    float2 data2 = vload2(0, (__global float *)tensor3D_offset(&input, 6, 0, 0));
+
+    // Compute DFT N = 7
+    DFT_7(data0.s01, data0.s23, data0.s45, data0.s67, data1.s01, data1.s23, data2.s01);
+
+    // Store seven complex output values
+    vstore8(data0, 0, (__global float *)output.ptr);
+    vstore4(data1, 0, (__global float *)tensor3D_offset(&output, 4, 0, 0));
+    vstore2(data2, 0, (__global float *)tensor3D_offset(&output, 6, 0, 0));
+}
+
+/** Computes the first stage of a radix-8 DFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                            Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                       Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                         input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in,out] input_stride_y                       Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                         input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in,out] input_stride_z                       Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                         input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes  The offset of the first element in the source tensor
+ * @param[out]    output_ptr                           Pointer to the destination image.
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +kernel void fft_radix_8_first_stage_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ +) +{ + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + + // Load eight complex input values + float16 data = vload16(0, (__global float *)input.ptr); + + // Compute DFT N = 8 + DFT_8(data.s01, data.s23, data.s45, data.s67, data.s89, data.sAB, data.sCD, data.sEF); + + // Store eight complex output values + vstore16(data, 0, (__global float *)output.ptr); +} + +/** Computes a stage of a radix-2 FFT. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage + * @param[in] Ni Nx * Ny. 
+ * @param[in] exp_const Exponent constant + */ +kernel void fft_radix_2_axis_0( + TENSOR3D_DECLARATION(input) +#ifndef IN_PLACE + , + TENSOR3D_DECLARATION(output) +#endif /* not IN_PLACE */ + , + uint Nx, uint Ni, float exp_const) +{ + // Each work-item computes a single radix-2 + uint kx = get_global_id(0); + + // Compute nx + uint nx = kx % Nx; + + // Compute n index + uint n = nx + (kx / Nx) * Ni; + + // Get tensor pointers + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input); + input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z; +#ifdef IN_PLACE + Tensor3D output = input; +#else /* IN_PLACE */ + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output); + output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z; +#endif /* IN_PLACE */ + + // Load two complex input values + float2 c0 = vload2(0, (__global float *)input.ptr); + float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0)); + + // Compute phi + float phi = (float)nx * exp_const; + + // Multiply by twiddle factor + TWIDDLE_FACTOR_MULTIPLICATION(phi, c1); + + // Compute DFT N = 2 + DFT_2(c0, c1); + + // Store two complex output values + vstore2(c0, 0, (__global float *)output.ptr); + vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0)); +} + +/** Computes a stage of a radix-3 FFT. + * + * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time + * + * @param[in,out] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in,out] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in,out] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in,out] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in,out] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in,out] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in,out] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in,out] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] Nx The butterfly span. Products of radix order of previous radix's stage + * @param[in] Ni Nx * Ny. 
+ * @param[in]     exp_const                             Exponent constant ((-2 * pi) / Ni)
+ */
+kernel void fft_radix_3_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-3
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load three complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+
+    // Compute DFT N = 3
+    DFT_3(c0, c1, c2);
+
+    // Store three complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-4 FFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                          input_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in,out] input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                          input_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in,out] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                          input_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out]    output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]     output_step_x                         output_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in]     output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]     output_step_y                         output_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in]     output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                         output_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in]     output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]     Nx                                    The butterfly span: the product of the radix orders of all previous FFT stages
+ * @param[in]     Ni                                    Nx * Ny, where Ny is the radix of this stage
+ * @param[in]     exp_const                             Exponent constant ((-2 * pi) / Ni)
+ */
+kernel void fft_radix_4_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-4
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load four complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+
+    // Compute DFT N = 4
+    DFT_4(c0, c1, c2, c3);
+
+    // Store four complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-5 FFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                          input_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in,out] input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                          input_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in,out] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                          input_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out]    output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]     output_step_x                         output_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in]     output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]     output_step_y                         output_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in]     output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                         output_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in]     output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]     Nx                                    The butterfly span: the product of the radix orders of all previous FFT stages
+ * @param[in]     Ni                                    Nx * Ny, where Ny is the radix of this stage
+ * @param[in]     exp_const                             Exponent constant ((-2 * pi) / Ni)
+ */
+kernel void fft_radix_5_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-5
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load five complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+
+    // Compute DFT N = 5
+    DFT_5(c0, c1, c2, c3, c4);
+
+    // Store five complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-7 FFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                          input_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in,out] input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                          input_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in,out] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                          input_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out]    output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]     output_step_x                         output_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in]     output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]     output_step_y                         output_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in]     output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                         output_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in]     output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]     Nx                                    The butterfly span: the product of the radix orders of all previous FFT stages
+ * @param[in]     Ni                                    Nx * Ny, where Ny is the radix of this stage
+ * @param[in]     exp_const                             Exponent constant ((-2 * pi) / Ni)
+ */
+kernel void fft_radix_7_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-7
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load seven complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+
+    // Compute DFT N = 7
+    DFT_7(c0, c1, c2, c3, c4, c5, c6);
+
+    // Store seven complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+}
+
+/** Computes a stage of a radix-8 FFT.
+ *
+ * @note In order to perform the FFT function "in-place", the pre-processor -DIN_PLACE must be passed at compile time
+ *
+ * @param[in,out] input_ptr                             Pointer to the source tensor. Supported data types: F32
+ * @param[in,out] input_stride_x                        Stride of the source tensor in X dimension (in bytes)
+ * @param[in,out] input_step_x                          input_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in,out] input_stride_y                        Stride of the source tensor in Y dimension (in bytes)
+ * @param[in,out] input_step_y                          input_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in,out] input_stride_z                        Stride of the source tensor in Z dimension (in bytes)
+ * @param[in,out] input_step_z                          input_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in,out] input_offset_first_element_in_bytes   The offset of the first element in the source tensor
+ * @param[out]    output_ptr                            Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in]     output_stride_x                       Stride of the destination tensor in X dimension (in bytes)
+ * @param[in]     output_step_x                         output_stride_x * number of elements along X processed per work-item (in bytes)
+ * @param[in]     output_stride_y                       Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in]     output_step_y                         output_stride_y * number of elements along Y processed per work-item (in bytes)
+ * @param[in]     output_stride_z                       Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in]     output_step_z                         output_stride_z * number of elements along Z processed per work-item (in bytes)
+ * @param[in]     output_offset_first_element_in_bytes  The offset of the first element in the destination tensor
+ * @param[in]     Nx                                    The butterfly span: the product of the radix orders of all previous FFT stages
+ * @param[in]     Ni                                    Nx * Ny, where Ny is the radix of this stage
+ * @param[in]     exp_const                             Exponent constant ((-2 * pi) / Ni)
+ */
+kernel void fft_radix_8_axis_0(
+    TENSOR3D_DECLARATION(input)
+#ifndef IN_PLACE
+    ,
+    TENSOR3D_DECLARATION(output)
+#endif /* not IN_PLACE */
+    ,
+    uint Nx, uint Ni, float exp_const)
+{
+    // Each work-item computes a single radix-8
+    uint kx = get_global_id(0);
+
+    // Compute nx
+    uint nx = kx % Nx;
+
+    // Compute n index
+    uint n = nx + (kx / Nx) * Ni;
+
+    // Get tensor pointers
+    Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(input);
+    input.ptr += n * input.stride_x + get_global_id(1) * input.stride_y + get_global_id(2) * input.stride_z;
+#ifdef IN_PLACE
+    Tensor3D output = input;
+#else /* IN_PLACE */
+    Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT_NO_STEP(output);
+    output.ptr += n * output.stride_x + get_global_id(1) * output.stride_y + get_global_id(2) * output.stride_z;
+#endif /* IN_PLACE */
+
+    // Load eight complex input values
+    float2 c0 = vload2(0, (__global float *)input.ptr);
+    float2 c1 = vload2(0, (__global float *)tensor3D_offset(&input, Nx, 0, 0));
+    float2 c2 = vload2(0, (__global float *)tensor3D_offset(&input, 2 * Nx, 0, 0));
+    float2 c3 = vload2(0, (__global float *)tensor3D_offset(&input, 3 * Nx, 0, 0));
+    float2 c4 = vload2(0, (__global float *)tensor3D_offset(&input, 4 * Nx, 0, 0));
+    float2 c5 = vload2(0, (__global float *)tensor3D_offset(&input, 5 * Nx, 0, 0));
+    float2 c6 = vload2(0, (__global float *)tensor3D_offset(&input, 6 * Nx, 0, 0));
+    float2 c7 = vload2(0, (__global float *)tensor3D_offset(&input, 7 * Nx, 0, 0));
+
+    // Compute phi
+    float phi = (float)nx * exp_const;
+
+    // Multiply by twiddle factor
+    TWIDDLE_FACTOR_MULTIPLICATION(phi, c1);
+    TWIDDLE_FACTOR_MULTIPLICATION(2 * phi, c2);
+    TWIDDLE_FACTOR_MULTIPLICATION(3 * phi, c3);
+    TWIDDLE_FACTOR_MULTIPLICATION(4 * phi, c4);
+    TWIDDLE_FACTOR_MULTIPLICATION(5 * phi, c5);
+    TWIDDLE_FACTOR_MULTIPLICATION(6 * phi, c6);
+    TWIDDLE_FACTOR_MULTIPLICATION(7 * phi, c7);
+
+    // Compute DFT N = 8
+    DFT_8(c0, c1, c2, c3, c4, c5, c6, c7);
+
+    // Store eight complex output values
+    vstore2(c0, 0, (__global float *)output.ptr);
+    vstore2(c1, 0, (__global float *)tensor3D_offset(&output, Nx, 0, 0));
+    vstore2(c2, 0, (__global float *)tensor3D_offset(&output, 2 * Nx, 0, 0));
+    vstore2(c3, 0, (__global float *)tensor3D_offset(&output, 3 * Nx, 0, 0));
+    vstore2(c4, 0, (__global float *)tensor3D_offset(&output, 4 * Nx, 0, 0));
+    vstore2(c5, 0, (__global float *)tensor3D_offset(&output, 5 * Nx, 0, 0));
+    vstore2(c6, 0, (__global float *)tensor3D_offset(&output, 6 * Nx, 0, 0));
+    vstore2(c7, 0, (__global float *)tensor3D_offset(&output, 7 * Nx, 0, 0));
+}
\ No newline at end of file
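Each kernel above performs one decimation-in-time stage: work-item kx locates its butterfly through n = nx + (kx / Nx) * Ni, rotates the m-th input by the twiddle factor exp(i * m * phi) with phi = (-2 * pi / Ni) * nx, and writes the radix-N DFT results back at stride Nx. The host-side C++ sketch below mirrors the radix-3 kernel over an interleaved-complex buffer; it is illustrative only (std::complex stands in for the CL float2 helpers, and radix3_stage is a hypothetical name, not library API):

    #include <complex>
    #include <vector>

    // Hypothetical host-side mirror of fft_radix_3_axis_0: one radix-3 stage.
    // Nx is the butterfly span, Ni = 3 * Nx; x.size() must be a multiple of Ni.
    static void radix3_stage(std::vector<std::complex<float>> &x, unsigned int Nx)
    {
        const unsigned int        Ni        = 3 * Nx;
        const float               exp_const = -2.0f * 3.14159265f / (float)Ni;
        const std::complex<float> w(-0.5f, -0.8660254f); // exp(-2 * pi * i / 3)

        for(unsigned int kx = 0; kx < x.size() / 3; ++kx) // one butterfly per "work-item"
        {
            const unsigned int nx = kx % Nx;
            const unsigned int n  = nx + (kx / Nx) * Ni; // same index math as the kernel

            std::complex<float> c0 = x[n];
            std::complex<float> c1 = x[n + Nx];
            std::complex<float> c2 = x[n + 2 * Nx];

            // TWIDDLE_FACTOR_MULTIPLICATION: rotate the m-th input by m * phi
            const float phi = (float)nx * exp_const;
            c1 *= std::polar(1.0f, phi);
            c2 *= std::polar(1.0f, 2 * phi);

            // DFT_3: 3-point DFT; conj(w) is the other primitive cube root of unity
            x[n]          = c0 + c1 + c2;
            x[n + Nx]     = c0 + w * c1 + std::conj(w) * c2;
            x[n + 2 * Nx] = c0 + std::conj(w) * c1 + w * c2;
        }
    }

Applied after the digit-reverse permutation with Nx = 1, 3, 9, ... at successive stages, this sketch reproduces a full power-of-three FFT.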
diff --git a/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
new file mode 100644
index 0000000000..d72647c3c9
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTDigitReverseKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTDigitReverseKernel.h"
+
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Window.h"
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(idx, 1, DataType::U32);
+    ARM_COMPUTE_RETURN_ERROR_ON(axis != 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, ITensorInfo *idx, unsigned int axis)
+{
+    ARM_COMPUTE_UNUSED(idx, axis);
+
+    auto_init_if_empty(*output, *input);
+
+    Window win = calculate_max_window(*output, Steps());
+    output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTDigitReverseKernel::CLFFTDigitReverseKernel()
+    : _input(nullptr), _output(nullptr), _idx(nullptr)
+{
+}
+
+void CLFFTDigitReverseKernel::configure(const ICLTensor *input, ICLTensor *output, const ICLTensor *idx, unsigned int axis)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, idx);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), idx->info(), axis));
+
+    _input  = input;
+    _output = output;
+    _idx    = idx;
+
+    // Create kernel
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("digit_reverse"));
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), output->info(), idx->info(), axis);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "digit_reverse_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTDigitReverseKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const ITensorInfo *idx, unsigned int axis)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, idx, axis));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), idx->clone().get(), axis).first);
+
+    return Status{};
+}
+
+void CLFFTDigitReverseKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx, _output, slice);
+        add_1D_tensor_argument(idx, _idx, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
diff --git a/src/core/CL/kernels/CLFFTRadixStageKernel.cpp b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
new file mode 100644
index 0000000000..87a12b9da9
--- /dev/null
+++ b/src/core/CL/kernels/CLFFTRadixStageKernel.cpp
@@ -0,0 +1,163 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/CL/kernels/CLFFTRadixStageKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/CLValidate.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Window.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelDescriptor &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(config.axis != 0);
+    ARM_COMPUTE_RETURN_ERROR_ON(CLFFTRadixStageKernel::supported_radix().count(config.radix) == 0);
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const FFTRadixStageKernelDescriptor &config)
+{
+    if(output != nullptr)
+    {
+        auto_init_if_empty(*output, *input);
+    }
+
+    Window win = calculate_max_window(*input, Steps(config.radix));
+    if(output != nullptr)
+    {
+        output->set_valid_region(ValidRegion(Coordinates(), output->tensor_shape()));
+    }
+
+    return std::make_pair(Status{}, win);
+}
+} // namespace
+
+CLFFTRadixStageKernel::CLFFTRadixStageKernel()
+    : _input(nullptr), _output(nullptr), _run_in_place(false)
+{
+}
+
+void CLFFTRadixStageKernel::configure(ICLTensor *input, ICLTensor *output, const FFTRadixStageKernelDescriptor &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (output != nullptr) ? output->info() : nullptr, config));
+
+    _input        = input;
+    _output       = output;
+    _run_in_place = (output == nullptr) || (output == input);
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option_if(_run_in_place, "-DIN_PLACE");
+
+    // Create kernel
+    std::string kernel_name = "fft";
+    kernel_name += "_radix_" + support::cpp11::to_string(config.radix);
+    kernel_name += (config.is_first_stage) ? "_first_stage" : "";
+    kernel_name += "_axis_" + support::cpp11::to_string(config.axis);
+    _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
+
+    // Set static arguments if not the first stage
+    if(!config.is_first_stage)
+    {
+        const unsigned int Ni        = config.Nx * config.radix;
+        const float        exp_const = (-2.0 * M_PI) / static_cast<float>(Ni);
+        unsigned int       idx       = (1 + (_run_in_place ? 0 : 1)) * num_arguments_per_3D_tensor(); // Skip the input and output parameters
+        _kernel.setArg(idx++, config.Nx);
+        _kernel.setArg(idx++, Ni);
+        _kernel.setArg(idx++, exp_const);
+    }
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(input->info(), (_run_in_place) ? nullptr : output->info(), config);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    ICLKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(input->info()->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(input->info()->dimension(1));
+}
+
+Status CLFFTRadixStageKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const FFTRadixStageKernelDescriptor &config)
+{
+    const bool run_in_place = (output == nullptr) || (output == input);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, config));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
+                                                              (run_in_place) ? nullptr : output->clone().get(),
+                                                              config)
+                                .first);
+
+    return Status{};
+}
+
+std::set<unsigned int> CLFFTRadixStageKernel::supported_radix()
+{
+    return std::set<unsigned int> { 2, 3, 4, 5, 7, 8 };
+}
+
+void CLFFTRadixStageKernel::run(const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        if(!_run_in_place)
+        {
+            add_3D_tensor_argument(idx, _output, slice);
+        }
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace arm_compute
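As a concrete trace of configure() above: a non-first radix-4 stage on axis 0 that follows stages with a combined span of Nx = 8 compiles "fft_radix_4_axis_0" and receives Nx = 8, Ni = 32 and exp_const = -2 * pi / 32 as static arguments, while a first stage compiles the "_first_stage" kernel variant and gets no static arguments. A standalone C++ sketch of that assembly (illustrative only, not library code):

    #include <iostream>
    #include <string>

    int main()
    {
        const unsigned int radix = 4, Nx = 8, axis = 0;
        const bool         is_first_stage = false;

        // Same naming scheme as CLFFTRadixStageKernel::configure()
        std::string kernel_name = "fft_radix_" + std::to_string(radix);
        kernel_name += is_first_stage ? "_first_stage" : "";
        kernel_name += "_axis_" + std::to_string(axis);

        const unsigned int Ni        = Nx * radix; // 32
        const float        exp_const = -2.0f * 3.14159265f / (float)Ni;

        // Prints: fft_radix_4_axis_0 Nx=8 Ni=32 exp_const=-0.19635
        std::cout << kernel_name << " Nx=" << Nx << " Ni=" << Ni << " exp_const=" << exp_const << "\n";
        return 0;
    }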
diff --git a/src/core/utils/helpers/fft.cpp b/src/core/utils/helpers/fft.cpp
new file mode 100644
index 0000000000..7ff2fdf62b
--- /dev/null
+++ b/src/core/utils/helpers/fft.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/utils/helpers/fft.h"
+
+#include <numeric>
+
+namespace arm_compute
+{
+namespace helpers
+{
+namespace fft
+{
+std::vector<unsigned int> decompose_stages(unsigned int N, const std::set<unsigned int> &supported_factors)
+{
+    std::vector<unsigned int> stages;
+    unsigned int              res = N;
+
+    // Early exit if no supported factors are provided
+    if(supported_factors.empty())
+    {
+        return stages;
+    }
+
+    // Create reverse iterator (start decomposing from the largest supported factors)
+    auto rfactor_it = supported_factors.rbegin();
+
+    // Decomposition step
+    while(res != 0)
+    {
+        const unsigned int factor = *rfactor_it;
+        if(0 == (res % factor) && res >= factor)
+        {
+            stages.push_back(factor);
+            res /= factor;
+        }
+        else
+        {
+            ++rfactor_it;
+            if(rfactor_it == supported_factors.rend())
+            {
+                if(res > 1)
+                {
+                    // Couldn't decompose with the given factors
+                    stages.clear();
+                    return stages;
+                }
+                else
+                {
+                    res = 0;
+                }
+            }
+        }
+    }
+
+    return stages;
+}
+
+std::vector<unsigned int> digit_reverse_indices(unsigned int N, const std::vector<unsigned int> &fft_stages)
+{
+    std::vector<unsigned int> idx_digit_reverse;
+
+    // Early exit in case N and the fft stages do not match
+    const float stages_prod = std::accumulate(std::begin(fft_stages), std::end(fft_stages), 1, std::multiplies<float>());
+    if(stages_prod != N)
+    {
+        return idx_digit_reverse;
+    }
+
+    // Resize digit reverse vector
+    idx_digit_reverse.resize(N);
+
+    // Get number of radix stages
+    unsigned int n_stages = fft_stages.size();
+
+    // Scan elements
+    for(unsigned int n = 0; n < N; ++n)
+    {
+        unsigned int k  = n;
+        unsigned int Nx = fft_stages[0];
+
+        // Scan stages
+        for(unsigned int s = 1; s < n_stages; ++s)
+        {
+            // Radix of the s-th stage
+            unsigned int Ny = fft_stages[s];
+            unsigned int Ni = Ny * Nx;
+
+            // Update k index
+            k = (k * Ny) % Ni + (k / Nx) % Ny + Ni * (k / Ni);
+
+            // Update Nx
+            Nx *= Ny;
+        }
+
+        // k is the digit-reversed index of n
+        idx_digit_reverse[n] = k;
+    }
+
+    return idx_digit_reverse;
+}
+} // namespace fft
+} // namespace helpers
+} // namespace arm_compute
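The two helpers compose: decompose_stages() factorizes N greedily starting from the largest supported radix, and digit_reverse_indices() derives the input permutation for that exact stage order. A small driver (illustrative; it assumes the helpers above are built and linked) traces N = 96:

    #include "arm_compute/core/utils/helpers/fft.h"

    #include <iostream>
    #include <set>

    int main()
    {
        const std::set<unsigned int> supported{ 2, 3, 4, 5, 7, 8 };

        // Greedy decomposition from the largest factor: 96 = 8 * 4 * 3
        const auto stages = arm_compute::helpers::fft::decompose_stages(96, supported);
        for(auto s : stages)
        {
            std::cout << s << " "; // prints: 8 4 3
        }
        std::cout << "\n";

        // Mixed-radix digit reversal for the stage order {8, 4, 3}
        const auto idx = arm_compute::helpers::fft::digit_reverse_indices(96, stages);
        std::cout << idx[0] << " " << idx[1] << "\n"; // prints: 0 12 (n = 1 reverses to 96 / 8 = 12)
        return 0;
    }

A prime length such as 11 has no decomposition over {2, 3, 4, 5, 7, 8}, so decompose_stages() returns an empty vector and CLFFT1D::validate() rejects the shape (see the "Undecomposable FFT" case in the validation tests below).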
diff --git a/src/runtime/CL/functions/CLFFT1D.cpp b/src/runtime/CL/functions/CLFFT1D.cpp
new file mode 100644
index 0000000000..6b6735ae58
--- /dev/null
+++ b/src/runtime/CL/functions/CLFFT1D.cpp
@@ -0,0 +1,119 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/helpers/fft.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+
+namespace arm_compute
+{
+CLFFT1D::CLFFT1D(std::shared_ptr<IMemoryManager> memory_manager)
+    : _memory_group(std::move(memory_manager)), _digit_reversed_input(), _digit_reverse_indices(), _digit_reverse_kernel(), _fft_kernels(), _num_ffts(0)
+{
+}
+
+void CLFFT1D::configure(const ICLTensor *input, ICLTensor *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(CLFFT1D::validate(input->info(), output->info(), config));
+
+    // Decompose size to radix factors
+    const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->info()->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_ERROR_ON(decomposed_vector.empty());
+
+    // Configure digit reverse
+    TensorInfo digit_reverse_indices_info(TensorShape(input->info()->tensor_shape()[config.axis]), 1, DataType::U32);
+    _digit_reverse_indices.allocator()->init(digit_reverse_indices_info);
+    _memory_group.manage(&_digit_reversed_input);
+    _digit_reverse_kernel.configure(input, &_digit_reversed_input, &_digit_reverse_indices, config.axis);
+
+    // Create and configure FFT kernels
+    unsigned int Nx = 1;
+    _num_ffts       = decomposed_vector.size();
+    _fft_kernels    = arm_compute::support::cpp14::make_unique<CLFFTRadixStageKernel[]>(_num_ffts);
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        const unsigned int radix_for_stage = decomposed_vector.at(i);
+
+        FFTRadixStageKernelDescriptor fft_kernel_desc;
+        fft_kernel_desc.axis           = config.axis;
+        fft_kernel_desc.radix          = radix_for_stage;
+        fft_kernel_desc.Nx             = Nx;
+        fft_kernel_desc.is_first_stage = (i == 0);
+        _fft_kernels[i].configure(&_digit_reversed_input, i == (_num_ffts - 1) ? output : nullptr, fft_kernel_desc);
+
+        Nx *= radix_for_stage;
+    }
+
+    // Allocate tensors
+    _digit_reversed_input.allocator()->allocate();
+    _digit_reverse_indices.allocator()->allocate();
+
+    // Init digit reverse indices
+    const auto digit_reverse_cpu = arm_compute::helpers::fft::digit_reverse_indices(N, decomposed_vector);
+    _digit_reverse_indices.map(CLScheduler::get().queue(), true);
+    std::copy_n(digit_reverse_cpu.data(), N, reinterpret_cast<unsigned int *>(_digit_reverse_indices.buffer()));
+    _digit_reverse_indices.unmap(CLScheduler::get().queue());
+}
+
+Status CLFFT1D::validate(const ITensorInfo *input, const ITensorInfo *output, const FFT1DInfo &config)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 2, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON(config.axis != 0);
+
+    // Check if FFT is decomposable
+    const auto         supported_radix   = CLFFTRadixStageKernel::supported_radix();
+    const unsigned int N                 = input->tensor_shape()[config.axis];
+    const auto         decomposed_vector = arm_compute::helpers::fft::decompose_stages(N, supported_radix);
+    ARM_COMPUTE_RETURN_ERROR_ON(decomposed_vector.empty());
+
+    // Checks performed when output is configured
+    if((output != nullptr) && (output->total_size() != 0))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    }
+
+    return Status{};
+}
+
+void CLFFT1D::run()
+{
+    _memory_group.acquire();
+
+    CLScheduler::get().enqueue(_digit_reverse_kernel, false);
+
+    for(unsigned int i = 0; i < _num_ffts; ++i)
+    {
+        CLScheduler::get().enqueue(_fft_kernels[i], i == (_num_ffts - 1));
+    }
+
+    _memory_group.release();
+}
+} // namespace arm_compute
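In short, configure() decomposes N, prepares the digit-reverse permutation and chains the radix stages in place, with only the last stage writing to the user-visible output. A minimal usage sketch follows (assumptions of the sketch, not requirements stated by this patch: a default-initialized scheduler and a 96-point, 2-channel F32 tensor holding interleaved real/imaginary parts):

    #include "arm_compute/core/Types.h"
    #include "arm_compute/runtime/CL/CLScheduler.h"
    #include "arm_compute/runtime/CL/CLTensor.h"
    #include "arm_compute/runtime/CL/functions/CLFFT1D.h"
    #include "arm_compute/runtime/FunctionDescriptors.h"

    int main()
    {
        using namespace arm_compute;

        CLScheduler::get().default_init();

        CLTensor src;
        CLTensor dst;
        src.allocator()->init(TensorInfo(TensorShape(96U), 2, DataType::F32));
        dst.allocator()->init(TensorInfo(TensorShape(96U), 2, DataType::F32));

        CLFFT1D fft;
        fft.configure(&src, &dst, FFT1DInfo()); // forward complex FFT along axis 0

        src.allocator()->allocate();
        dst.allocator()->allocate();

        // ... map src, write the 96 complex input samples, unmap ...

        fft.run();
        CLScheduler::get().sync();
        return 0;
    }

The test fixtures below follow the same configure-then-allocate pattern.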
diff --git a/tests/benchmark/CL/FFT.cpp b/tests/benchmark/CL/FFT.cpp
new file mode 100644
index 0000000000..b345d58eaf
--- /dev/null
+++ b/tests/benchmark/CL/FFT.cpp
@@ -0,0 +1,55 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/benchmark/fixtures/FFTFixture.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "utils/TypePrinter.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F32 });
+const auto shapes     = framework::dataset::make("Shapes", { TensorShape(192U, 128U, 64U), TensorShape(224U, 224U) });
+} // namespace
+
+using CLFFT1DFixture = FFT1DFixture<CLTensor, CLFFT1D>;
+
+TEST_SUITE(CL)
+
+REGISTER_FIXTURE_DATA_TEST_CASE(FFT1D, CLFFT1DFixture, framework::DatasetMode::ALL,
+                                framework::dataset::combine(shapes, data_types));
+
+TEST_SUITE_END() // CL
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/benchmark/fixtures/FFTFixture.h b/tests/benchmark/fixtures/FFTFixture.h
new file mode 100644
index 0000000000..c9c4e3a88e
--- /dev/null
+++ b/tests/benchmark/fixtures/FFTFixture.h
@@ -0,0 +1,83 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_FFT_FIXTURE
+#define ARM_COMPUTE_TEST_FFT_FIXTURE
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "tests/Globals.h"
+#include "tests/Utils.h"
+#include "tests/framework/Fixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace benchmark
+{
+template <typename TensorType, typename Function>
+class FFT1DFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type)
+    {
+        // Create tensors
+        src = create_tensor<TensorType>(shape, data_type, 2);
+        dst = create_tensor<TensorType>(shape, data_type, 2);
+
+        // Create and configure function
+        fft_func.configure(&src, &dst, FFT1DInfo());
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+    }
+
+    void run()
+    {
+        fft_func.run();
+    }
+
+    void sync()
+    {
+        sync_if_necessary<TensorType>();
+        sync_tensor_if_necessary<TensorType>(dst);
+    }
+
+    void teardown()
+    {
+        src.allocator()->free();
+        dst.allocator()->free();
+    }
+
+private:
+    TensorType src{};
+    TensorType dst{};
+    Function   fft_func{};
+};
+} // namespace benchmark
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_FFT_FIXTURE */
diff --git a/tests/validation/CL/FFT.cpp b/tests/validation/CL/FFT.cpp
new file mode 100644
index 0000000000..0d29532c29
--- /dev/null
+++ b/tests/validation/CL/FFT.cpp
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/functions/CLFFT1D.h"
+#include "tests/CL/CLAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Macros.h"
+#include "tests/framework/datasets/Datasets.h"
+#include "tests/validation/Validation.h"
+#include "tests/validation/fixtures/FFTFixture.h"
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+namespace
+{
+const auto data_types = framework::dataset::make("DataType", { DataType::F32 });
+const auto shapes     = framework::dataset::make("TensorShape", { TensorShape(2U, 2U, 3U), TensorShape(3U, 2U, 3U),
+                                                                  TensorShape(4U, 2U, 3U), TensorShape(5U, 2U, 3U),
+                                                                  TensorShape(7U, 2U, 3U), TensorShape(8U, 2U, 3U),
+                                                                  TensorShape(9U, 2U, 3U), TensorShape(25U, 2U, 3U),
+                                                                  TensorShape(49U, 2U, 3U), TensorShape(64U, 2U, 3U),
+                                                                  TensorShape(16U, 2U, 3U), TensorShape(32U, 2U, 3U),
+                                                                  TensorShape(96U, 2U, 2U)
+                                                                });
+} // namespace
+TEST_SUITE(CL)
+TEST_SUITE(FFT1D)
+
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(shapes, data_types),
+               shape, data_type)
+{
+    // Create tensors
+    CLTensor src = create_tensor<CLTensor>(shape, data_type, 2);
+    CLTensor dst = create_tensor<CLTensor>(shape, data_type, 2);
+
+    ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+    ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+    // Create and configure function
+    CLFFT1D fft1d;
+    fft1d.configure(&src, &dst, FFT1DInfo());
+
+    // Validate valid region
+    const ValidRegion valid_region = shape_to_valid_region(shape);
+    validate(src.info()->valid_region(), valid_region);
+    validate(dst.info()->valid_region(), valid_region);
+
+    // Validate padding
+    validate(src.info()->padding(), PaddingSize());
+    validate(dst.info()->padding(), PaddingSize());
+}
+
+// *INDENT-OFF*
+// clang-format off
+DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
+               framework::dataset::make("InputInfo",  { TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Mismatching data types
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Mismatching shapes
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32), // Invalid channels
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32), // Unsupported axis
+                                                        TensorInfo(TensorShape(11U, 13U, 2U), 2, DataType::F32), // Undecomposable FFT
+                                                        TensorInfo(TensorShape(25U, 13U, 2U), 2, DataType::F32),
+                                                      }),
+               framework::dataset::make("OutputInfo", { TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F16),
+                                                        TensorInfo(TensorShape(16U, 13U, 2U), 2, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 1, DataType::F32),
+                                                        TensorInfo(TensorShape(32U, 13U, 2U), 2, DataType::F32),
+                                                        TensorInfo(TensorShape(11U, 13U, 2U), 2, DataType::F32),
+                                                        TensorInfo(TensorShape(25U, 13U, 2U), 2, DataType::F32),
+                                                      })),
+               framework::dataset::make("Axis",       { 0, 0, 0, 1, 0, 0 })),
+               framework::dataset::make("Expected",   { false, false, false, false, false, true })),
+               input_info, output_info, axis, expected)
+{
+    FFT1DInfo desc;
+    desc.axis      = axis;
+    const Status s = CLFFT1D::validate(&input_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), desc);
+    ARM_COMPUTE_EXPECT(bool(s) == expected, framework::LogLevel::ERRORS);
+}
+// clang-format on
+// *INDENT-ON*
+
+template <typename T>
+using CLFFT1DFixture = FFTValidationFixture<CLTensor, CLAccessor, CLFFT1D, T>;
+
+TEST_SUITE(Float)
+TEST_SUITE(FP32)
+FIXTURE_DATA_TEST_CASE(RunSmall, CLFFT1DFixture<float>, framework::DatasetMode::ALL, combine(shapes, framework::dataset::make("DataType", DataType::F32)))
+{
+    // Validate output
+    validate(CLAccessor(_target), _reference, RelativeTolerance<float>(0.1f), 0.05f);
+}
+TEST_SUITE_END() // FP32
+TEST_SUITE_END() // Float
+
+TEST_SUITE_END() // FFT1D
+TEST_SUITE_END() // CL
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
diff --git a/tests/validation/fixtures/FFTFixture.h b/tests/validation/fixtures/FFTFixture.h
new file mode 100644
index 0000000000..8e3c01eaff
--- /dev/null
+++ b/tests/validation/fixtures/FFTFixture.h
@@ -0,0 +1,110 @@
+/*
+ * Copyright (c) 2019 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_TEST_FFT_FIXTURE
+#define ARM_COMPUTE_TEST_FFT_FIXTURE
+
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "tests/AssetsLibrary.h"
+#include "tests/Globals.h"
+#include "tests/IAccessor.h"
+#include "tests/framework/Asserts.h"
+#include "tests/framework/Fixture.h"
+#include "tests/validation/reference/DFT.h"
+
+#include <random>
+
+namespace arm_compute
+{
+namespace test
+{
+namespace validation
+{
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class FFTValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape shape, DataType data_type)
+    {
+        _target    = compute_target(shape, data_type);
+        _reference = compute_reference(shape, data_type);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(_target.info()->tensor_shape(), _reference.shape());
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        std::uniform_real_distribution<float> distribution(-5.f, 5.f);
+        library->fill(tensor, distribution, 0);
+    }
+
+    TensorType compute_target(const TensorShape &shape, DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(shape, data_type, 2);
+        TensorType dst = create_tensor<TensorType>(shape, data_type, 2);
+
+        // Create and configure function
+        FunctionType fft1d;
+        fft1d.configure(&src, &dst, FFT1DInfo());
+
+        ARM_COMPUTE_EXPECT(src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_EXPECT(!src.info()->is_resizable(), framework::LogLevel::ERRORS);
+        ARM_COMPUTE_EXPECT(!dst.info()->is_resizable(), framework::LogLevel::ERRORS);
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        // Compute function
+        fft1d.run();
+
+        return dst;
+    }
+
+    SimpleTensor<T> compute_reference(const TensorShape &shape, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ shape, data_type, 2 };
+
+        // Fill reference
+        fill(src);
+
+        return reference::dft_1d(src, reference::FFTDirection::Forward);
+    }
+
+    TensorType      _target{};
+    SimpleTensor<T> _reference{};
+};
+} // namespace validation
+} // namespace test
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_TEST_FFT_FIXTURE */
--
cgit v1.2.1