From afd38f0c617d6f89b2b4532c6c44f116617e2b6f Mon Sep 17 00:00:00 2001 From: Felix Thomasmathibalan Date: Wed, 27 Sep 2023 17:46:17 +0100 Subject: Apply clang-format on repository Code is formatted as per a revised clang format configuration file(not part of this delivery). Version 14.0.6 is used. Exclusion List: - files with .cl extension - files that are not strictly C/C++ (e.g. Android.bp, Sconscript ...) And the following directories - compute_kernel_writer/validation/ - tests/ - include/ - src/core/NEON/kernels/convolution/ - src/core/NEON/kernels/arm_gemm/ - src/core/NEON/kernels/arm_conv/ - data/ There will be a follow up for formatting of .cl files and the files under tests/ and compute_kernel_writer/validation/. Signed-off-by: Felix Thomasmathibalan Change-Id: Ib7eb1fcf4e7537b9feaefcfc15098a804a3fde0a Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/10391 Benchmark: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Gunes Bayir --- src/cpu/kernels/pool3d/neon/impl.h | 417 +++++++++++++++++--------------- src/cpu/kernels/pool3d/neon/quantized.h | 403 +++++++++++++++--------------- 2 files changed, 428 insertions(+), 392 deletions(-) (limited to 'src/cpu/kernels/pool3d/neon') diff --git a/src/cpu/kernels/pool3d/neon/impl.h b/src/cpu/kernels/pool3d/neon/impl.h index 013e25537c..ce89199b5d 100644 --- a/src/cpu/kernels/pool3d/neon/impl.h +++ b/src/cpu/kernels/pool3d/neon/impl.h @@ -25,9 +25,10 @@ #define SRC_CORE_POOLING_3D_LAYER_IMPL_H #include "arm_compute/core/Helpers.h" -#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/intrinsics/intrinsics.h" #include "src/cpu/kernels/pool3d/neon/quantized.h" namespace arm_compute @@ -37,8 +38,13 @@ namespace cpu namespace { template -void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_start_x, const int window_end_x, const int window_step_x) +void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; @@ -71,80 +77,87 @@ void max_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, 
-in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + vres = wrapper::vdup_n(static_cast(-std::numeric_limits::infinity()), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmax(vres, data); + } } } + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); } - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); - res = -std::numeric_limits::infinity(); - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + res = -std::numeric_limits::infinity(); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const 
uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res = std::max(res, data); + } } } + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; } - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + }, + out); } template -void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; using vector_type = typename vtype::type; @@ -183,95 +196,103 @@ void avg_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, pool_stride_z); - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + + const int pool_start_z = std::max(0, 
-in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vadd(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vadd(vres, data); + } } } - } - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - // Divide by scale - 
res *= scale; + // Divide by scale + res *= scale; - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); } template -void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, - const Window &window_out, const int window_start_x, const int window_end_x, const int window_step_x) +void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, + ITensor *dst0, + Pooling3dLayerInfo &pool_info, + const Window &window_out, + const int window_start_x, + const int window_end_x, + const int window_step_x) { using vtype = wrapper::traits::neon_bitvector; using vector_type = typename vtype::type; @@ -310,97 +331,100 @@ void l2_poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dL Iterator out(dst0, window_out); vector_type vres; - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, - pool_stride_y, 
pool_stride_z); + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + int x_off = window_start_x; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - // Perform pooling - vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + // Perform pooling + vres = wrapper::vdup_n(static_cast(0.0f), tag_type()); + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - vres = wrapper::vmla(vres, data, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const vector_type data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + vres = wrapper::vmla(vres, data, data); + } } } - } - - const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - // Divide by scale - vres = wrapper::vmul(vres, scale_v); + const vector_type scale_v = wrapper::vdup_n(static_cast(scale), tag_type()); - // Calculate square-root - vres = wrapper::vinv(wrapper::vinvsqrt(vres)); + // Divide by scale + vres = wrapper::vmul(vres, scale_v); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); - } + // Calculate square-root + vres = wrapper::vinv(wrapper::vinvsqrt(vres)); - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res(0); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res(0); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data * data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data * data; + } } } - } - // Divide by scale - res *= scale; + // Divide by scale + res *= scale; - // Square root - res = std::sqrt(res); + // Square root + res = std::sqrt(res); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; - } - }, - out); + // 
Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } + }, + out); } } // namespace @@ -415,16 +439,19 @@ void poolingMxNxD_fp_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: - max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + max_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::AVG: - avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + avg_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; case PoolingType::L2: - l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, window_step_x); + l2_poolingMxNxD_fp_neon_ndhwc(src, dst0, pool_info, window_out, window_start_x, window_end_x, + window_step_x); break; default: ARM_COMPUTE_ERROR("Pool operation not supported"); @@ -440,7 +467,7 @@ void poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLaye // Needed to handle loop left-over window_out.set(Window::DimX, Window::Dimension(0, 1, 1)); - switch(pool_info.pool_type) + switch (pool_info.pool_type) { case PoolingType::MAX: max_poolingMxNxD_q8_neon_ndhwc(src, dst0, pool_info, window_out, window_step_x); diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index ac14f5eafa..8819907901 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -26,17 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template -void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const int32_t new_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / quant_rescale); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int 
pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32x4_t vres1 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast(0.f), wrapper::traits::vector_128_tag{}); + + // 
Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), 
scale_v)); + vres2 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off + 8, res2); + } } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - q32_t res = static_cast(0.f); - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32_t res = static_cast(0.f); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + res += data; + } } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast(0.5f + static_cast(res) * scale); + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast(0.5f + static_cast(res) * scale); - // Store result - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } template -void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void max_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector::type; @@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - 
const int32_t requant_offset = dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast(static_cast(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x16_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < 
pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - // Leftovers using half the window step - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling(vres, requant_qinfo) : vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res = std::numeric_limits::min(); + // Store result + wrapper::vstore(reinterpret_cast(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? 
vrequantize_pooling(vres, requant_qinfo) : vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res = std::numeric_limits::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast(in_ptr_x) + x_off); - - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast(in_ptr_x) + x_off); + + res = std::max(res, data); + } } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast(res); - *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast(res); + *(reinterpret_cast(out.ptr()) + x_off) = quantize(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H \ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H -- cgit v1.2.1
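
Note (not part of the patch): the revised .clang-format configuration referenced in the commit message is not included in this delivery. The sketch below is only an illustration of the kind of clang-format 14 settings that could produce formatting similar to the diff above (Allman braces, a space after control-statement keywords, a roughly 120-column limit with long argument lists wrapped one per line, case-insensitive include sorting, aligned consecutive declarations). Every option value here is an assumption, not the actual Arm Compute Library configuration.

    # Hypothetical .clang-format sketch -- all values are guesses, not the real config
    Language:                     Cpp
    BasedOnStyle:                 LLVM
    ColumnLimit:                  120            # long call argument lists wrap, one argument per line
    BreakBeforeBraces:            Allman         # braces on their own line, as in the reformatted loops
    SpaceBeforeParens:            ControlStatements  # "for (", "if (", "switch (" instead of "for("
    IncludeBlocks:                Regroup        # include blocks merged and re-sorted
    SortIncludes:                 CaseInsensitive  # orders NEON/ headers after helpers/ headers
    AlignConsecutiveDeclarations: Consecutive    # aligns the "const int ..." declaration columns
    AlignConsecutiveAssignments:  Consecutive
    PointerAlignment:             Right

With a configuration like this in the repository root, a single header could be reformatted in place with, for example:

    clang-format-14 -i --style=file src/cpu/kernels/pool3d/neon/impl.h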