aboutsummaryrefslogtreecommitdiff
path: root/src/cpu/kernels
diff options
context:
space:
mode:
Diffstat (limited to 'src/cpu/kernels')
-rw-r--r--src/cpu/kernels/CpuPool3dKernel.cpp176
-rw-r--r--src/cpu/kernels/CpuPool3dKernel.h90
-rw-r--r--src/cpu/kernels/pool3d/list.h42
-rw-r--r--src/cpu/kernels/pool3d/neon/fp16.cpp36
-rw-r--r--src/cpu/kernels/pool3d/neon/fp32.cpp34
-rw-r--r--src/cpu/kernels/pool3d/neon/impl.cpp460
-rw-r--r--src/cpu/kernels/pool3d/neon/impl.h42
7 files changed, 880 insertions, 0 deletions
diff --git a/src/cpu/kernels/CpuPool3dKernel.cpp b/src/cpu/kernels/CpuPool3dKernel.cpp
new file mode 100644
index 0000000000..3321967d2f
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.cpp
@@ -0,0 +1,176 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/CpuPool3dKernel.h"
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CPP/Validate.h"
+#include "src/core/common/Registrars.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "src/cpu/kernels/pool3d/list.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+using namespace misc::shape_calculator;
+
+static const std::vector<CpuPool3dKernel::Pooling3dKernel> available_kernels =
+{
+ {
+ "neon_fp16_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F16 && data.isa.fp16); },
+ REGISTER_FP16_NEON(arm_compute::cpu::neon_fp16_pool3d)
+ },
+
+ {
+ "neon_fp32_ndhwc_poolMxNxD",
+ [](const DataTypeISASelectorData & data) { return (data.dt == DataType::F32); },
+ REGISTER_FP32_NEON(arm_compute::cpu::neon_fp32_pool3d)
+ }
+};
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32);
+
+ const auto data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ const unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width;
+ const unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height;
+ const unsigned int pool_size_z = is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth;
+
+ const unsigned int stride_x = pool_info.stride.x();
+ const unsigned int stride_y = pool_info.stride.y();
+ const unsigned int stride_z = pool_info.stride.z();
+
+ ARM_COMPUTE_RETURN_ERROR_ON((pool_size_x == 0) || (pool_size_y == 0) || (pool_size_z == 0));
+ ARM_COMPUTE_RETURN_ERROR_ON((stride_x == 0) || (stride_y == 0) || (stride_z == 0));
+
+ int output_width = 0;
+ int output_height = 0;
+ int output_depth = 0;
+
+ std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], src->tensor_shape()[idx_depth],
+ pool_size_x, pool_size_y, pool_size_z, pool_info);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid");
+
+ if(dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst);
+ TensorInfo out_info(TensorInfo(compute_pool3d_shape(src->tensor_shape(), pool_info), 1, dst->data_type(), DataLayout::NDHWC));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info);
+ }
+
+ const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ ARM_COMPUTE_RETURN_ERROR_ON(uk == nullptr || uk->ukernel == nullptr);
+
+ return Status{};
+}
+} //namespace
+
+void CpuPool3dKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info));
+
+ // dst auto inizialitation if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_pool3d_shape(src->tensor_shape(), pool_info)));
+
+ // Get data layout
+ const auto data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_depth = get_data_layout_dimension_index(data_layout, DataLayoutDimension::DEPTH);
+
+ // Update pool size in case of global pooling
+ const bool is_global_pooling = pool_info.is_global_pooling;
+ const Size3D pool_size(
+ is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width,
+ is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height,
+ is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth);
+
+ const auto *uk = CpuPool3dKernel::get_implementation(DataTypeISASelectorData{ src->data_type(), CPUInfo::get().get_isa() });
+ ARM_COMPUTE_ERROR_ON(uk == nullptr);
+
+ // Set instance variables
+ _pool_info = pool_info;
+ _run_method = uk->ukernel;
+ _name = std::string("CpuPool3dKernel").append("/").append(uk->name);
+
+ // Configure kernel window
+ Window win = calculate_max_window(*dst, Steps());
+ ICpuKernel::configure(win);
+}
+
+Status CpuPool3dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src);
+
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info));
+
+ return Status{};
+}
+
+void CpuPool3dKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+ ARM_COMPUTE_ERROR_ON(_run_method == nullptr);
+
+ const ITensor *src = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ ITensor *dst = tensors.get_tensor(TensorType::ACL_DST_0);
+
+ _run_method(src, dst, _pool_info, window);
+}
+
+const char *CpuPool3dKernel::name() const
+{
+ return _name.c_str();
+}
+
+const std::vector<CpuPool3dKernel::Pooling3dKernel> &CpuPool3dKernel::get_available_kernels()
+{
+ return available_kernels;
+}
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/kernels/CpuPool3dKernel.h b/src/cpu/kernels/CpuPool3dKernel.h
new file mode 100644
index 0000000000..f762cfca9a
--- /dev/null
+++ b/src/cpu/kernels/CpuPool3dKernel.h
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+#define ARM_COMPUTE_CPU_POOL3D_KERNEL_H
+
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+#include "src/cpu/ICpuKernel.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+/** Interface for the kernel to perform Pooling 3D. */
+class CpuPool3dKernel : public ICpuKernel<CpuPool3dKernel>
+{
+private:
+ /* Template function for Pooling 3D NDHWC */
+ using Pooling3dKernelPtr = std::add_pointer<void(const ITensor *, ITensor *, Pooling3dLayerInfo &, const Window &)>::type;
+
+public:
+ CpuPool3dKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuPool3dKernel);
+ /** Set the src, dst tensor and pooling info.
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |F16 |F16 |
+ * |F32 |F32 |
+ *
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: Same as @p src.
+ * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo.
+ */
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuPool3dKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Pooling3dLayerInfo &pool_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+ const char *name() const override;
+
+ struct Pooling3dKernel
+ {
+ const char *name;
+ const DataTypeISASelectorPtr is_selected;
+ Pooling3dKernelPtr ukernel;
+ };
+
+ static const std::vector<Pooling3dKernel> &get_available_kernels();
+
+private:
+ Pooling3dLayerInfo _pool_info{};
+ Pooling3dKernelPtr _run_method{ nullptr };
+ std::string _name{};
+};
+
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+#endif /*ARM_COMPUTE_CPU_POOL3D_KERNEL_H */ \ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/list.h b/src/cpu/kernels/pool3d/list.h
new file mode 100644
index 0000000000..ece780eb0b
--- /dev/null
+++ b/src/cpu/kernels/pool3d/list.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+#define SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H
+
+namespace arm_compute
+{
+namespace cpu
+{
+#define DECLARE_POOLING_KERNEL(func_name) \
+ void func_name(const ITensor *src0, ITensor *dst0, Pooling3dLayerInfo &, const Window &window)
+
+DECLARE_POOLING_KERNEL(neon_fp16_pool3d);
+DECLARE_POOLING_KERNEL(neon_fp32_pool3d);
+
+#undef DECLARE_POOLING_KERNEL
+
+} // namespace cpu
+} // namespace arm_compute
+
+#endif // SRC_CORE_NEON_KERNELS_POOLING3D_LIST_H \ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp16.cpp b/src/cpu/kernels/pool3d/neon/fp16.cpp
new file mode 100644
index 0000000000..b79bcd93b5
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp16.cpp
@@ -0,0 +1,36 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp16_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_fp_neon_ndhwc<float16_t>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */ \ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/fp32.cpp b/src/cpu/kernels/pool3d/neon/fp32.cpp
new file mode 100644
index 0000000000..2c06a9d57a
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/fp32.cpp
@@ -0,0 +1,34 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+namespace arm_compute
+{
+namespace cpu
+{
+void neon_fp32_pool3d(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ return poolingMxNxD_fp_neon_ndhwc<float>(src, dst0, pool_info, window);
+}
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/cpu/kernels/pool3d/neon/impl.cpp b/src/cpu/kernels/pool3d/neon/impl.cpp
new file mode 100644
index 0000000000..bb3999b104
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.cpp
@@ -0,0 +1,460 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/utils/misc/Traits.h"
+#include "src/core/NEON/wrapper/intrinsics/intrinsics.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "src/cpu/kernels/pool3d/neon/impl.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+inline float calculate_avg_scale(bool exclude_padding, const Coordinates &id, const int pool_size_x, const int pool_size_y, const int pool_size_z, const int upper_bound_w,
+ const int upper_bound_h, const int upper_bound_d, const int pad_x, const int pad_y, const int pad_z, const int stride_x, const int stride_y, const int stride_z)
+{
+ // Based on NDHWC
+ int start_x = id[1] * stride_x - pad_x;
+ int start_y = id[2] * stride_y - pad_y;
+ int start_z = id[3] * stride_z - pad_z;
+
+ const int end_x = std::min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = std::min(start_y + pool_size_y, upper_bound_h);
+ const int end_z = std::min(start_z + pool_size_z, upper_bound_d);
+ if(exclude_padding)
+ {
+ start_x = std::max(0, start_x);
+ start_y = std::max(0, start_y);
+ start_z = std::max(0, start_z);
+ }
+ return 1.f / ((end_y - start_y) * (end_x - start_x) * (end_z - start_z));
+}
+
+
+template <typename T>
+void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out,
+ const int window_start_x, const int window_end_x, const int window_step_x)
+
+{
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ Iterator out(dst0, window_out);
+
+ vector_type vres;
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ int x_off = window_start_x;
+
+ for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ vres = wrapper::vdup_n(static_cast<T>(-std::numeric_limits<float>::infinity()), tag_type());
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmax(vres, data);
+ }
+ }
+ }
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for(; x_off < window_end_x; ++x_off)
+ {
+ T res(0);
+ res = -std::numeric_limits<float>::infinity();
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res = std::max(res, data);
+ }
+ }
+ }
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
+}
+
+template <typename T>
+void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
+ const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+{
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_right = static_cast<int>(pool_info.padding.right);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+ const int pool_pad_back = static_cast<int>(pool_info.padding.back);
+
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ Iterator out(dst0, window_out);
+
+ vector_type vres;
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ // Calculate scale
+ const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+ pool_pad_top, pool_pad_front, pool_stride_x,
+ pool_stride_y, pool_stride_z);
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+ int x_off = window_start_x;
+
+ for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vadd(vres, data);
+ }
+ }
+ }
+
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for(; x_off < window_end_x; ++x_off)
+ {
+ T res(0);
+
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data;
+ }
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
+}
+
+template <typename T>
+void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info,
+ const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x)
+{
+ using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
+ using vector_type = typename vtype::type;
+ using tag_type = typename vtype::tag_type;
+
+ int pool_stride_x = static_cast<int>(pool_info.stride.width);
+ int pool_stride_y = static_cast<int>(pool_info.stride.height);
+ int pool_stride_z = static_cast<int>(pool_info.stride.depth);
+
+ const int pool_size_x = pool_info.is_global_pooling ? src->info()->tensor_shape().y() : pool_info.pool_size.width;
+ const int pool_size_y = pool_info.is_global_pooling ? src->info()->tensor_shape().z() : pool_info.pool_size.height;
+ const int pool_size_z = pool_info.is_global_pooling ? src->info()->tensor_shape()[3] : pool_info.pool_size.depth;
+
+ const int pool_pad_top = static_cast<int>(pool_info.padding.top);
+ const int pool_pad_bottom = static_cast<int>(pool_info.padding.bottom);
+ const int pool_pad_left = static_cast<int>(pool_info.padding.left);
+ const int pool_pad_right = static_cast<int>(pool_info.padding.right);
+ const int pool_pad_front = static_cast<int>(pool_info.padding.front);
+ const int pool_pad_back = static_cast<int>(pool_info.padding.back);
+
+ const int upper_bound_w = src->info()->dimension(1) + (pool_info.exclude_padding ? 0 : pool_pad_right);
+ const int upper_bound_h = src->info()->dimension(2) + (pool_info.exclude_padding ? 0 : pool_pad_bottom);
+ const int upper_bound_d = src->info()->dimension(3) + (pool_info.exclude_padding ? 0 : pool_pad_back);
+
+ const int input_dim_w = src->info()->dimension(1);
+ const int input_dim_h = src->info()->dimension(2);
+ const int input_dim_d = src->info()->dimension(3);
+
+ const int y_stride = static_cast<int>(src->info()->strides_in_bytes().y());
+ const int z_stride = static_cast<int>(src->info()->strides_in_bytes().z());
+ const int w_stride = static_cast<int>(src->info()->strides_in_bytes()[3]);
+ const int n_stride = static_cast<int>(src->info()->strides_in_bytes()[4]);
+
+ const uint8_t *in_ptr_start = src->buffer() + src->info()->offset_first_element_in_bytes();
+
+ Iterator out(dst0, window_out);
+
+ vector_type vres;
+ execute_window_loop(window_out, [&](const Coordinates & id)
+ {
+ // Computing the theoretical input starting/ending points
+ const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left;
+ const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top;
+ const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front;
+
+ const int pool_start_x = std::max(0, -in_idx_width);
+ const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x);
+ const int pool_start_y = std::max(0, -in_idx_height);
+ const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y);
+
+ const int pool_start_z = std::max(0, -in_idx_depth);
+ const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z);
+
+ // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z
+ const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width);
+ const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height);
+ const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth);
+
+ const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride;
+
+ // Calculate scale
+ const float scale = calculate_avg_scale(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left,
+ pool_pad_top, pool_pad_front, pool_stride_x,
+ pool_stride_y, pool_stride_z);
+
+ int x_off = window_start_x;
+
+ for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C
+ {
+ // Perform pooling
+ vres = wrapper::vdup_n(static_cast<T>(0.0f), tag_type());
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const vector_type data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ vres = wrapper::vmla(vres, data, data);
+ }
+ }
+ }
+
+ const vector_type scale_v = wrapper::vdup_n(static_cast<T>(scale), tag_type());
+
+ // Divide by scale
+ vres = wrapper::vmul(vres, scale_v);
+
+ // Calculate square-root
+ vres = wrapper::vinv(wrapper::vinvsqrt(vres));
+
+ // Store result
+ wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, vres);
+ }
+
+ // Left-overs loop
+ for(; x_off < window_end_x; ++x_off)
+ {
+ T res(0);
+
+ for(int z = pool_start_z; z < pool_end_z; ++z)
+ {
+ const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride;
+ for(int y = pool_start_y; y < pool_end_y; ++y)
+ {
+ const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride;
+ for(int x = pool_start_x; x < pool_end_x; ++x)
+ {
+ const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride;
+ const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off);
+ res += data * data;
+ }
+ }
+ }
+
+ // Divide by scale
+ res *= scale;
+
+ // Square root
+ res = std::sqrt(res);
+
+ // Store result
+ *(reinterpret_cast<T *>(out.ptr()) + x_off) = res;
+ }
+ },
+ out);
+}
+} // namespace
+
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window)
+{
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ constexpr int window_step_x = 16 / sizeof(T);
+ Window window_out = window;
+
+ // Needed to handle loop left-over
+ window_out.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ switch(pool_info.pool_type)
+ {
+ case PoolingType::MAX:
+ max_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ break;
+ case PoolingType::AVG:
+ avg_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ break;
+ case PoolingType::L2:
+ l2_poolingMxNxD_fp_neon_ndhwc<T>(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x);
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Pool operation not supported");
+ }
+}
+
+template void poolingMxNxD_fp_neon_ndhwc<float>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS)
+template void poolingMxNxD_fp_neon_ndhwc<float16_t>(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+#endif /* defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) */
+} // namespace cpu
+} // namespace arm_compute
diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h
new file mode 100644
index 0000000000..829a9bd192
--- /dev/null
+++ b/src/cpu/kernels/pool3d/neon/impl.h
@@ -0,0 +1,42 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef SRC_CORE_POOLING_3D_LAYER_IMPL_H
+#define SRC_CORE_POOLING_3D_LAYER_IMPL_H
+
+#include "arm_compute/core/Helpers.h"
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+class Window;
+struct Pooling3dLayerInfo;
+namespace cpu
+{
+template <typename T>
+void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window);
+
+} // namespace cpu
+} // namespace arm_compute
+#endif //define SRC_CORE_POOLING_3D_LAYER_IMPL_H