diff options
Diffstat (limited to 'src/cpu/kernels/pool3d/neon/quantized.h')
-rw-r--r-- | src/cpu/kernels/pool3d/neon/quantized.h | 403 |
1 files changed, 206 insertions, 197 deletions
diff --git a/src/cpu/kernels/pool3d/neon/quantized.h b/src/cpu/kernels/pool3d/neon/quantized.h index ac14f5eafa..8819907901 100644 --- a/src/cpu/kernels/pool3d/neon/quantized.h +++ b/src/cpu/kernels/pool3d/neon/quantized.h @@ -26,17 +26,18 @@ #include "arm_compute/core/ITensor.h" #include "arm_compute/core/Types.h" -#include "src/core/NEON/wrapper/wrapper.h" + #include "src/core/helpers/PoolingHelpers.h" #include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/wrapper/wrapper.h" namespace arm_compute { namespace cpu { template <typename T> -void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void avg_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; @@ -89,144 +90,147 @@ void avg_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const float quant_rescale = dst_qinfo.scale / src_qinfo.scale; // "new_offset" doesn't have to consider the "half_scale_v" in its computation // With a requantization performed in a single step there won't be uncertainties introduced - const int32_t new_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + const int32_t new_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / quant_rescale); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - // Calculate scale - const float scale = calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, - pool_pad_top, pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // Calculate scale + const float scale = + calculate_avg_scale_pool3d(pool_info.exclude_padding, id, pool_size_x, pool_size_y, pool_size_z, + upper_bound_w, upper_bound_h, upper_bound_d, pool_pad_left, pool_pad_top, + pool_pad_front, pool_stride_x, pool_stride_y, pool_stride_z); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); - q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32x4_t vres1 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres2 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres3 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + q32x4_t vres4 = wrapper::vdup_n(static_cast<q32_t>(0.f), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); - const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); - vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); - vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); - vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); - vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + const q16x8_t data_q16 = wrapper::vmovl(wrapper::vgetlow(data)); + const q16x8_t data2_q16 = wrapper::vmovl(wrapper::vgethigh(data)); + vres1 = wrapper::vadd(vres1, wrapper::vmovl(wrapper::vgetlow(data_q16))); + vres2 = wrapper::vadd(vres2, wrapper::vmovl(wrapper::vgethigh(data_q16))); + vres3 = wrapper::vadd(vres3, wrapper::vmovl(wrapper::vgetlow(data2_q16))); + vres4 = wrapper::vadd(vres4, wrapper::vmovl(wrapper::vgethigh(data2_q16))); + } } } - } - if(src_qinfo != dst_qinfo) - { - const float32x4x4_t vres = + if (src_qinfo != dst_qinfo) { - { + const float32x4x4_t vres = {{ vcvtq_f32_q32(vres1), vcvtq_f32_q32(vres2), vcvtq_f32_q32(vres3), vcvtq_f32_q32(vres4), - } - }; - const auto requantized_dst = vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); - } - else - { - const float32x4_t scale_v = vdupq_n_f32(scale); - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); - vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); - vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); - vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); - - const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); - const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + }}; + const auto requantized_dst = + vrequantize_pooling_with_scale<q8x16_t>(vres, quant_rescale, scale, new_offset); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, wrapper::vgetlow(requantized_dst)); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, wrapper::vgethigh(requantized_dst)); + } + else + { + const float32x4_t scale_v = vdupq_n_f32(scale); + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + vres1 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres1), scale_v)); + vres2 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres2), scale_v)); + vres3 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres3), scale_v)); + vres4 = vcvtq_q32_f32<q32x4_t>(wrapper::vmla(half_scale_v, vcvtq_f32_q32(vres4), scale_v)); + + const q8x8_t res1 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres1), wrapper::vmovn(vres2))); + const q8x8_t res2 = wrapper::vmovn(wrapper::vcombine(wrapper::vmovn(vres3), wrapper::vmovn(vres4))); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, res1); + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off + 8, res2); + } } - } - - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - q32_t res = static_cast<q32_t>(0.f); - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q32_t res = static_cast<q32_t>(0.f); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - res += data; + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + res += data; + } } } - } - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast<float>(res); - const float new_scale = quant_rescale / scale; - const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + const float new_scale = quant_rescale / scale; + const auto requantized_dst = quantize<T>(res_f, UniformQuantizationInfo(new_scale, new_offset)); - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; - } - else - { - // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero - res = static_cast<T>(0.5f + static_cast<float>(res) * scale); + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = requantized_dst; + } + else + { + // Divide by scale and add 0.5f to round to nearest instead of rounding towards zero + res = static_cast<T>(0.5f + static_cast<float>(res) * scale); - // Store result - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + // Store result + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } template <typename T> -void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, - const int window_step_x) +void max_poolingMxNxD_q8_neon_ndhwc( + const ITensor *src, ITensor *dst0, Pooling3dLayerInfo &pool_info, const Window &window_out, const int window_step_x) { using q8x8_t = typename wrapper::traits::neon_vector<T, 8>::type; @@ -266,125 +270,130 @@ void max_poolingMxNxD_q8_neon_ndhwc(const ITensor *src, ITensor *dst0, Pooling3d const UniformQuantizationInfo src_qinfo = src->info()->quantization_info().uniform(); const UniformQuantizationInfo dst_qinfo = dst0->info()->quantization_info().uniform(); - const float requant_scale = dst_qinfo.scale / src_qinfo.scale; - const int32_t requant_offset = dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); - const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - - execute_window_loop(window_out, [&](const Coordinates & id) - { - // Computing the theoretical input starting/ending points - const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; - const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; - const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; + const float requant_scale = dst_qinfo.scale / src_qinfo.scale; + const int32_t requant_offset = + dst_qinfo.offset - static_cast<int32_t>(static_cast<float>(src_qinfo.offset) / requant_scale); + const UniformQuantizationInfo requant_qinfo = UniformQuantizationInfo(requant_scale, requant_offset); - const int pool_start_x = std::max(0, -in_idx_width); - const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); - const int pool_start_y = std::max(0, -in_idx_height); - const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // Computing the theoretical input starting/ending points + const int in_idx_width = static_cast<int>(id.y()) * pool_stride_x - pool_pad_left; + const int in_idx_height = static_cast<int>(id.z()) * pool_stride_y - pool_pad_top; + const int in_idx_depth = static_cast<int>(id[3]) * pool_stride_z - pool_pad_front; - const int pool_start_z = std::max(0, -in_idx_depth); - const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); + const int pool_start_x = std::max(0, -in_idx_width); + const int pool_end_x_t = std::min(input_dim_w + pool_pad_left - in_idx_width, pool_size_x); + const int pool_start_y = std::max(0, -in_idx_height); + const int pool_end_y_t = std::min(input_dim_h + pool_pad_top - in_idx_height, pool_size_y); - // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z - const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); - const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); - const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); + const int pool_start_z = std::max(0, -in_idx_depth); + const int pool_end_z_t = std::min(input_dim_d + pool_pad_front - in_idx_depth, pool_size_z); - const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; + // The end of width to consider in calculation should exclude PAD_X, PAD_Y and PAD_Z + const int pool_end_x = std::min(pool_end_x_t, input_dim_w - in_idx_width); + const int pool_end_y = std::min(pool_end_y_t, input_dim_h - in_idx_height); + const int pool_end_z = std::min(pool_end_z_t, input_dim_d - in_idx_depth); - int x_off = window_start_x; + const uint8_t *in_ptr_n = in_ptr_start + id[4] * n_stride; - for(; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C - { - q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + int x_off = window_start_x; - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + for (; x_off <= (window_end_x - window_step_x); x_off += window_step_x) // C { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x16_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_128_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x16_t data = wrapper::vloadq(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), wrapper::vgethigh(vres), - requant_qinfo) : - vres); - } - // Leftovers using half the window step - for(; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) - { - q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) + ? vrequantize_pooling<q8x8_t, q8x16_t>(wrapper::vgetlow(vres), + wrapper::vgethigh(vres), requant_qinfo) + : vres); + } - // Perform pooling - for(int z = pool_start_z; z < pool_end_z; ++z) + // Leftovers using half the window step + for (; x_off <= (window_end_x - window_half_step_x); x_off += window_half_step_x) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + q8x8_t vres = wrapper::vdup_n(std::numeric_limits<T>::min(), wrapper::traits::vector_64_tag{}); + + // Perform pooling + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - vres = wrapper::vmax(vres, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const q8x8_t data = wrapper::vload(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + vres = wrapper::vmax(vres, data); + } } } - } - - // Store result - wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, - (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); - } - // Left-overs loop - for(; x_off < window_end_x; ++x_off) - { - T res = std::numeric_limits<T>::min(); + // Store result + wrapper::vstore(reinterpret_cast<T *>(out.ptr()) + x_off, + (src_qinfo != dst_qinfo) ? vrequantize_pooling<q8x8_t>(vres, requant_qinfo) : vres); + } - for(int z = pool_start_z; z < pool_end_z; ++z) + // Left-overs loop + for (; x_off < window_end_x; ++x_off) { - const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; - for(int y = pool_start_y; y < pool_end_y; ++y) + T res = std::numeric_limits<T>::min(); + + for (int z = pool_start_z; z < pool_end_z; ++z) { - const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; - for(int x = pool_start_x; x < pool_end_x; ++x) + const uint8_t *in_ptr_z = in_ptr_n + (z + in_idx_depth) * w_stride; + for (int y = pool_start_y; y < pool_end_y; ++y) { - const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; - const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); - - res = std::max(res, data); + const uint8_t *in_ptr_y = in_ptr_z + (y + in_idx_height) * z_stride; + for (int x = pool_start_x; x < pool_end_x; ++x) + { + const uint8_t *in_ptr_x = in_ptr_y + (x + in_idx_width) * y_stride; + const T data = *(reinterpret_cast<const T *>(in_ptr_x) + x_off); + + res = std::max(res, data); + } } } - } - // Store result - if(src_qinfo != dst_qinfo) - { - const float res_f = static_cast<float>(res); - *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); - } - else - { - *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + // Store result + if (src_qinfo != dst_qinfo) + { + const float res_f = static_cast<float>(res); + *(reinterpret_cast<T *>(out.ptr()) + x_off) = quantize<T>(res_f, requant_qinfo); + } + else + { + *(reinterpret_cast<T *>(out.ptr()) + x_off) = res; + } } - } - }, - out); + }, + out); } } // namespace cpu } // namespace arm_compute -#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H
\ No newline at end of file +#endif // SRC_CORE_NEON_KERNELS_POOL3D_QUANTIZED_H |