diff options
Diffstat (limited to 'src/cpu/kernels/directconv2d')
-rw-r--r--  src/cpu/kernels/directconv2d/impl.h               | 389
-rw-r--r--  src/cpu/kernels/directconv2d/list.h               |  77
-rw-r--r--  src/cpu/kernels/directconv2d/nchw/all.cpp         | 147
-rw-r--r--  src/cpu/kernels/directconv2d/nchw/fp16.cpp        |  84
-rw-r--r--  src/cpu/kernels/directconv2d/nchw/impl.h          | 166
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp   |  75
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp   |  40
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp   | 270
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/impl.h     |  45
-rw-r--r--  src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp |  65
10 files changed, 1358 insertions, 0 deletions
diff --git a/src/cpu/kernels/directconv2d/impl.h b/src/cpu/kernels/directconv2d/impl.h new file mode 100644 index 0000000000..d3965326cd --- /dev/null +++ b/src/cpu/kernels/directconv2d/impl.h @@ -0,0 +1,389 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T, bool has_pads> +void linearize_volume_nchw(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int top_left_x, + int top_left_y, + int kernel_width, + int kernel_height, + int kernel_depth, + int input_w, + int input_h, + int input_stride_x, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int kernel_size2 = kernel_width * kernel_height; + const int x_e = top_left_x + kernel_width * dilation_x; + const int y_e = top_left_y + kernel_height * dilation_y; + + // Linearize volume + int d = 0; + // This for loop linearize a volume with 3 slices. 
This allows: + // 1) to reduce the iterations of the outer for loop "d" + // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs + for (; d <= (kernel_depth - 3); d += 3) + { + for (int y = top_left_y; y < y_e; y += dilation_y) + { + if ((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + } + else + { + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if ((x < 0 || x >= input_w) && has_pads) + { + *(out_ptr + 0 * kernel_size2) = pad_value; + *(out_ptr + 1 * kernel_size2) = pad_value; + *(out_ptr + 2 * kernel_size2) = pad_value; + } + else + { + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast<const T *>( + in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + out_ptr += 2 * kernel_size2; + } + + // Left over + for (; d < kernel_depth; d++) + { + for (int y = top_left_y; y < y_e; y += dilation_y) + { + if ((y < 0 || y >= input_h) && has_pads) + { + // All the values will be the offset (will be zeros when not quantized) + memset(static_cast<void *>(out_ptr), pad_value, kernel_width * sizeof(T)); + out_ptr += kernel_width; + } + else + { + for (int x = top_left_x; x < x_e; x += dilation_x, ++out_ptr) + { + if ((x < 0 || x >= input_w) && has_pads) + { + *out_ptr = pad_value; + } + else + { + *out_ptr = *(reinterpret_cast<const T *>( + in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); 
+ } + } + } + } + } + + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads> +void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * input_c; + const int element_size = static_cast<int>(sizeof(T)); + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == input_c * element_size)) + { + for (int y = start_y; y < end_y; y += dilation_y) + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + else + { + for (int y = start_y; y < end_y; y += dilation_y) + { + if (y < 0 || y >= input_h) + { + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != input_c * element_size) + { + for (int x = start_x; x < end_x; x += dilation_x) + { + if (x < 0 || x >= input_w) + { + memset(static_cast<void *>(out_ptr), pad_value, input_c * element_size); + out_ptr += input_c; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), + input_c * element_size); + out_ptr += input_c; + } + } + } + else + { + //optimized for no dilation and no boundary pixels + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + start_x * input_stride_y)), + 
input_c * kernel_width * element_size); + out_ptr += input_c * kernel_width; + } + } + } + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads> +void linearize_volume_nhwc(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int start_x, + int start_y, + int kernel_width, + int kernel_height, + int input_w, + int input_h, + int input_c, + int input_stride_y, + int input_stride_z, + int pad_value, + int dilation_x, + int dilation_y, + int pad_right) +{ + const int end_x = start_x + kernel_width * dilation_x; + const int end_y = start_y + kernel_height * dilation_y; + const int pad_quant = kernel_width * (input_c + pad_right); + const int element_size = static_cast<int>(sizeof(T)); + const int channel_chunk_size = input_c * element_size; + + if ((start_y >= 0) && (end_y < input_h) && (start_x >= 0) && (end_x < input_w) && (dilation_x == 1) && + (input_stride_y == channel_chunk_size)) + { + for (int y = start_y; y < end_y; y += dilation_y) + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for (int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + for (int y = start_y; y < end_y; y += dilation_y) + { + if (y < 0 || y >= input_h) + { + memset(static_cast<void *>(out_ptr), pad_value, pad_quant * element_size); + out_ptr += pad_quant; + } + else if (dilation_x > 1 || start_x < 0 || end_x >= input_w || input_stride_y != channel_chunk_size) + { + for (int x = start_x; x < end_x; x += dilation_x) + { + if (x < 0 || x >= input_w) + { + memset(static_cast<void *>(out_ptr), pad_value, (input_c + pad_right) * element_size); + out_ptr += input_c + pad_right; + } + else + { + memcpy(out_ptr, reinterpret_cast<const T *>(in_ptr + (y * input_stride_z + x * input_stride_y)), + 
channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + else + { + const uint8_t *offset_ptr = in_ptr + (y * input_stride_z + start_x * input_stride_y); + for (int e = 0; e < kernel_width; e++) + { + memcpy(out_ptr, reinterpret_cast<const T *>(offset_ptr + e * channel_chunk_size), + channel_chunk_size); + out_ptr += input_c + pad_right; + } + } + } + } + // Append 1 if the convolution layer has biases + if (has_bias) + { + *out_ptr = static_cast<T>(1); + } +} + +template <typename T, bool has_pads, bool is_nchw> +void run_im2col(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + const int input_w = src->info()->dimension(width_idx); + const int input_h = src->info()->dimension(height_idx); + const int input_c = src->info()->dimension(channel_idx); + const int input_stride_x = src->info()->strides_in_bytes().x(); + const int input_stride_y = src->info()->strides_in_bytes().y(); + const int input_stride_z = src->info()->strides_in_bytes().z(); + const int pad_left = conv_info.pad_left(); + const int pad_top = conv_info.pad_top(); + const int stride_x = conv_info.stride().first; + const int stride_y = conv_info.stride().second; + const int pad_value = + is_data_type_quantized(src->info()->data_type()) ? 
src->info()->quantization_info().uniform().offset : 0; + + const auto kernel_width = kernel_dims.width; + const auto kernel_height = kernel_dims.height; + + Window window_in_out(window); + // The first three dimensions of the input and output are increased by the inner loops + window_in_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(src, window_in_out); + Iterator out(dst, window_in_out); + + execute_window_loop( + window, + [&](const Coordinates &id) + { + const int start_w = id[width_idx] * stride_x - pad_left; + const int start_h = id[height_idx] * stride_y - pad_top; + + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = + reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * convolved_dims.first) * + dst->info()->strides_in_bytes().y()); + + // Linearize volume + if (is_nchw) + { + linearize_volume_nchw<T, has_pads>( + input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, kernel_height, input_c, input_w, + input_h, input_stride_x, input_stride_y, input_stride_z, pad_value, dilation.x(), dilation.y()); + } + else + { + if (input_pad_right > 0) + { + linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, + kernel_height, input_w, input_h, input_c, input_stride_y, + input_stride_z, pad_value, dilation.x(), dilation.y(), + input_pad_right); + } + else + { + linearize_volume_nhwc<T, has_pads>(input_ptr, output_ptr, has_bias, start_w, start_h, kernel_width, + kernel_height, input_w, input_h, input_c, input_stride_y, + input_stride_z, pad_value, dilation.x(), dilation.y()); + } + } + }, + in, out); +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_IMPL_H diff --git a/src/cpu/kernels/directconv2d/list.h 
b/src/cpu/kernels/directconv2d/list.h new file mode 100644 index 0000000000..e3ff46b148 --- /dev/null +++ b/src/cpu/kernels/directconv2d/list.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/common/Registrars.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +#define DECLARE_DIRECT_CONV2D_KERNEL(func_name) \ + void func_name(const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, \ + const PadStrideInfo &conv_info) + +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nhwc_directconv2d); +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp16_nchw_directconv2d); +DECLARE_DIRECT_CONV2D_KERNEL(neon_fp32_nchw_directconv2d); + +#define DECLARE_IM2COL_KERNEL(func_name) \ + void func_name(const ITensor *src, ITensor *dst, const Window &window, DataLayout data_layout, \ + const PadStrideInfo &conv_info, std::pair<unsigned int, unsigned int> convolved_dims, \ + const Size2D &kernel_dims, const Size2D &dilation, uint32_t input_pad_right, bool has_bias) + +DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp32_nchw_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_nchw_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_nchw_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_pad); +DECLARE_IM2COL_KERNEL(run_im2col_qasymm8_nchw_nopad); + +DECLARE_IM2COL_KERNEL(run_im2col_fp32_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp32_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_pad); +DECLARE_IM2COL_KERNEL(run_im2col_fp16_nopad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_pad); +DECLARE_IM2COL_KERNEL(run_im2col_bf16_nopad); + +#undef DECLARE_DIRECT_CONV2D_KERNEL +#undef DECLARE_IM2COL_KERNEL + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_LIST_H diff --git 
a/src/cpu/kernels/directconv2d/nchw/all.cpp b/src/cpu/kernels/directconv2d/nchw/all.cpp new file mode 100644 index 0000000000..84f5eeff5a --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/all.cpp @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/list.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void neon_fp32_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw<float>(window, src, weights, dst, conv_info); +} + +void run_im2col_fp32_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<float, true, true>(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_fp32_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<float, false, true>(src, dst, window, data_layout, conv_info, convolved_dims, + kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout 
data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, false, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#if defined(ARM_COMPUTE_ENABLE_BF16) +void run_im2col_bf16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<bfloat16, true, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_bf16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<bfloat16, false, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +#endif /* defined(ARM_COMPUTE_ENABLE_BF16) */ + +} // namespace kernels +} // namespace cpu +} // 
namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nchw/fp16.cpp b/src/cpu/kernels/directconv2d/nchw/fp16.cpp new file mode 100644 index 0000000000..a9cab42f56 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/fp16.cpp @@ -0,0 +1,84 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +void neon_fp16_nchw_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nchw<float16_t>(window, src, weights, dst, conv_info); +} +#endif //defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + +void run_im2col_fp16_nchw_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, true, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +void run_im2col_fp16_nchw_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, false, true>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, 
input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nchw/impl.h b/src/cpu/kernels/directconv2d/nchw/impl.h new file mode 100644 index 0000000000..6a5b175d98 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nchw/impl.h @@ -0,0 +1,166 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H +#define ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" + +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T> +void convolve_nchw( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_UNUSED(conv_info); + + // Declare useful types + using vtype = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>; + using vector_type = typename vtype::type; + using tag_type = typename vtype::tag_type; + + // Scalar quantities + const int element_size = src->info()->element_size(); + const int input_stride_w = src->info()->strides_in_bytes()[0] / element_size; + const int input_stride_h = src->info()->strides_in_bytes()[1] / element_size; + const int input_stride_c = src->info()->strides_in_bytes()[2] / element_size; + const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size; + + const int input_dim_w = src->info()->dimension(0); + const int input_dim_h = src->info()->dimension(1); + + const int output_stride_c = dst->info()->strides_in_bytes()[2]; + + const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().x() / element_size; + const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().y() / element_size; + const unsigned int kernel_stride_c = weights->info()->strides_in_bytes().z() / element_size; + + const int kernel_dim_w = weights->info()->dimension(0); + const int kernel_dim_h = weights->info()->dimension(1); + + const int 
conv_pad_top = conv_info.pad_top(); + const int conv_pad_left = conv_info.pad_left(); + const int conv_stride_w = std::get<0>(conv_info.stride()); + const int conv_stride_h = std::get<1>(conv_info.stride()); + + // Setup input window for the output iterator + Window window_out = window; + window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup input window for the weights iterator + Window window_w = calculate_max_window(*weights->info(), Steps()); + window_w.set(Window::DimX, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimY, Window::Dimension(0, 1, 1)); + window_w.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + Iterator out(dst, window_out); + Iterator wei(weights, window_w); + + constexpr int num_elems_read_per_iteration = 16 / sizeof(T); + + execute_window_loop( + window_out, + [&](const Coordinates &id) + { + // We are computing the theoretical starting input starting points + const int in_w_start_t = static_cast<int>(id.x()) * conv_stride_w - conv_pad_left; + const int in_h_start_t = static_cast<int>(id.y()) * conv_stride_h - conv_pad_top; + const int in_w_end_t = in_w_start_t + kernel_dim_w; + const int in_h_end_t = in_h_start_t + kernel_dim_h; + + // We are computing the valid initial and ending input points by checking the borders + const int in_w_start = std::max(in_w_start_t, 0); + const int in_h_start = std::max(in_h_start_t, 0); + const int in_w_end = std::min(in_w_end_t, input_dim_w); + const int in_h_end = std::min(in_h_end_t, input_dim_h); + + // We use the input points to select the valid weight points to use + const int wei_w_start = in_w_start - in_w_start_t; + const int wei_h_start = in_h_start - in_h_start_t; + const int wei_h_end = kernel_dim_h - (in_h_end_t - in_h_end); + + const int index_c_end = weights->info()->dimension(2); + const T *const in_ptr_start = + reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) + + id[3] * input_stride_n; + execute_window_loop( + window_w, + 
[&](const Coordinates &id_w) + { + const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr()); + uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c; + T out_temp = static_cast<T>(0); + + for (int index_wei_c = 0, index_in_c = 0; index_wei_c < index_c_end; ++index_wei_c, ++index_in_c) + { + const T *const in_ptr_row_0 = in_ptr_start + index_in_c * input_stride_c; + const T *const weights_ptr_row_0 = weights_ptr_start + index_wei_c * kernel_stride_c; + for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end; + ++index_wei_h, ++index_in_h) + { + const T *in_ptr_row = in_ptr_row_0 + index_in_h * input_stride_h; + const T *weights_ptr_row = weights_ptr_row_0 + index_wei_h * kernel_stride_h; + int index_w = in_w_start; + int index_wei_w = wei_w_start; + vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type()); + for (; index_w <= ((in_w_end - num_elems_read_per_iteration)); + index_w += num_elems_read_per_iteration, index_wei_w += num_elems_read_per_iteration) + { + const auto src_vec = wrapper::vloadq(in_ptr_row + index_w * input_stride_w); + const auto w_vec = wrapper::vloadq(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp_vec = wrapper::vmla(out_temp_vec, w_vec, src_vec); + } + out_temp += vreduce(out_temp_vec); + for (; index_w < in_w_end; ++index_w, ++index_wei_w) + { + const auto src_val = *(in_ptr_row + index_w * input_stride_w); + const auto w_val = *(weights_ptr_row + index_wei_w * kernel_stride_w); + out_temp += src_val * w_val; + } + } + } + *(reinterpret_cast<T *>(out_ptr)) = out_temp; + }, + wei); + }, + out); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute +#endif // ACL_SRC_CPU_KERNELS_DIRECTCONV2D_NCHW_IMPL_H diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp new file mode 100644 index 0000000000..f78601544f --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp16.cpp @@ -0,0 +1,75 @@ 
+/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nchw/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void run_im2col_fp16_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, true, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} + +void run_im2col_fp16_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ +#if defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + arm_compute::cpu::kernels::run_im2col<float16_t, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +#else // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) + ARM_COMPUTE_UNUSED(src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, + has_bias); +#endif // defined(__ARM_FEATURE_FP16_VECTOR_ARITHMETIC) && defined(ENABLE_FP16_KERNELS) +} +} // namespace kernels +} // namespace cpu +} // namespace 
arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp new file mode 100644 index 0000000000..17d9212248 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/fp32.cpp @@ -0,0 +1,40 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void neon_fp32_nhwc_directconv2d( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info) +{ + convolve_nhwc<float>(window, src, weights, dst, conv_info); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp new file mode 100644 index 0000000000..f235167e28 --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/impl.cpp @@ -0,0 +1,270 @@ +/* + * Copyright (c) 2018-2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h"

#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/IAccessWindow.h"
#include "arm_compute/core/ITensor.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/core/Utils.h"

#include "src/core/helpers/WindowHelpers.h"
#include "src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h"
#include "src/core/NEON/wrapper/wrapper.h"

#include <algorithm>

using namespace arm_compute::detail;

namespace arm_compute
{
namespace cpu
{
namespace kernels
{
namespace
{
// True when neither the input nor the weights tensor carries left/right
// (x-dimension) padding, i.e. rows are stored back-to-back in memory. This is
// the precondition for the optimized path below, which walks W and C as one
// contiguous run of elements.
bool have_zero_x_internal_padding(ITensorInfo *src, const ITensorInfo *weights)
{
    return (src->padding().left == 0 && weights->padding().left == 0 && src->padding().right == 0 &&
            weights->padding().right == 0);
}
} // namespace

/** Direct 2D convolution over an NHWC input using 128-bit NEON vectors.
 *
 * Each iteration of @p window produces one output point across N/H/W; the
 * output channel dimension is generated by the inner loop over the weights'
 * batch dimension (one weights batch == one output channel).
 *
 * @tparam T Element type of input, weights and output (e.g. float).
 *
 * @param[in]  window    Output window to iterate over.
 * @param[in]  src       Input tensor (NHWC).
 * @param[in]  weights   Weights tensor; batches map to output channels.
 * @param[out] dst       Output tensor (NHWC).
 * @param[in]  conv_info Padding and stride configuration.
 */
template <typename T>
void convolve_nhwc(
    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info)
{
    // Declare useful types
    using vtype       = wrapper::traits::neon_bitvector<T, wrapper::traits::BitWidth::W128>;
    using vector_type = typename vtype::type;
    using tag_type    = typename vtype::tag_type;

    // Scalar quantities (byte strides converted to element strides)
    const int element_size   = src->info()->element_size();
    const int input_stride_w = src->info()->strides_in_bytes().y() / element_size;
    const int input_stride_h = src->info()->strides_in_bytes().z() / element_size;
    const int input_stride_n = src->info()->strides_in_bytes()[3] / element_size;
    const int input_dim_w    = src->info()->dimension(1);
    const int input_dim_h    = src->info()->dimension(2);

    // Kept in bytes: pointer arithmetic on out.ptr() below is uint8_t-based.
    const int output_stride_c = dst->info()->strides_in_bytes().x();

    const unsigned int kernel_stride_w = weights->info()->strides_in_bytes().y() / element_size;
    const unsigned int kernel_stride_h = weights->info()->strides_in_bytes().z() / element_size;
    const int          kernel_dim_w    = weights->info()->dimension(1);
    const int          kernel_dim_h    = weights->info()->dimension(2);

    const int conv_pad_top  = conv_info.pad_top();
    const int conv_pad_left = conv_info.pad_left();
    const int conv_stride_w = std::get<0>(conv_info.stride());
    const int conv_stride_h = std::get<1>(conv_info.stride());

    // Setup input window for the output iterator (channel dim handled in the
    // weights loop, so DimX is collapsed to a single step).
    Window window_out = window;
    window_out.set(Window::DimX, Window::Dimension(0, 1, 1));

    // Setup input window for the weights iterator: only the batch dimension
    // (output channels) is iterated; W/H/C are consumed inside the loop body.
    Window window_w = calculate_max_window(*weights->info(), Steps());
    window_w.set(Window::DimX, Window::Dimension(0, 1, 1));
    window_w.set(Window::DimY, Window::Dimension(0, 1, 1));
    window_w.set(Window::DimZ, Window::Dimension(0, 1, 1));

    Iterator out(dst, window_out);
    Iterator wei(weights, window_w);

    // 128-bit vectors: e.g. 4 lanes for float, 8 for float16_t.
    constexpr int num_elems_read_per_iteration = 16 / sizeof(T);

    // nhwc optimized
    if (have_zero_x_internal_padding(src->info(), weights->info()))
    {
        // This function assumes that input and weights have no padding in channel

        /*
         * This implementation parallelize the full WC plane of input and weights by
         * treating them as series of elements. So for example, a 3x3 weights and
         * floating point vector operations of 4 elements per time, the first 3
         * channel elements of the first row would be taken and additionally the first
         * element of the second row. The 9 elements in each single WC weight plane
         * would require 2 4-element vector operations and a last single element operation.
         *
         * This works since when we create the input vector to multiply with the weights,
         * the exact required elements are loaded in the same order. Therefore the
         * multiplication works on the correct input/weight elements.
         */
        execute_window_loop(
            window_out,
            [&](const Coordinates &id)
            {
                /*
                 * In here we create theoretical indexes which then we validate for both
                 * inputs and weights.
                 * As a reminder, this loop takes each output point in NHW, C is treated
                 * in the weights loop.
                 */
                // We are computing the theoretical input starting points
                const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
                const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
                const int in_w_end_t   = in_w_start_t + kernel_dim_w;
                const int in_h_end_t   = in_h_start_t + kernel_dim_h;

                // We are computing the valid initial and ending input points by checking the borders
                const int in_w_start = std::max(in_w_start_t, 0);
                const int in_h_start = std::max(in_h_start_t, 0);
                const int in_w_end   = std::min(in_w_end_t, input_dim_w);
                const int in_h_end   = std::min(in_h_end_t, input_dim_h);

                // We use the input points to select the valid weight points to use
                const int index_wc_start = (in_w_start - in_w_start_t) * kernel_stride_w;
                const int index_h_start  = in_h_start - in_h_start_t;
                const int index_wc_end   = (kernel_dim_w - (in_w_end_t - in_w_end)) * kernel_stride_w;
                const int index_h_end    = kernel_dim_h - (in_h_end_t - in_h_end);

                execute_window_loop(
                    window_w,
                    [&](const Coordinates &id_w)
                    {
                        /*
                         * This is the loop in the weights, and it goes along N (the batches)
                         * As a reminder, the batches of the weights are translated into the
                         * channels of the output
                         */
                        // Base input pointer for this batch, already offset to the first
                        // valid (non-border) input element of the receptive field.
                        const T *in_ptr_row =
                            reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
                            id[3] * input_stride_n + in_w_start * input_stride_w + in_h_start * input_stride_h;
                        // Weights pointer skips the rows clipped away at the top border.
                        const T *weights_ptr_row =
                            reinterpret_cast<const T *>(wei.ptr()) + index_h_start * kernel_stride_h;
                        uint8_t *out_ptr = out.ptr() + id_w[3] * output_stride_c;

                        T out_temp = static_cast<T>(0);
                        for (int index_h = index_h_start; index_h < index_h_end;
                             ++index_h, in_ptr_row += input_stride_h, weights_ptr_row += kernel_stride_h)
                        {
                            const T    *in_ptr_mover = in_ptr_row;
                            int         index_wc     = index_wc_start;
                            vector_type out_temp_vec = wrapper::vdup_n(static_cast<T>(0), tag_type());
                            // Vectorized multiply-accumulate over the contiguous WC run
                            for (; index_wc <= index_wc_end - num_elems_read_per_iteration;
                                 index_wc += num_elems_read_per_iteration, in_ptr_mover += num_elems_read_per_iteration)
                            {
                                const auto src_vec = wrapper::vloadq(in_ptr_mover);
                                const auto w_vec   = wrapper::vloadq(weights_ptr_row + index_wc);
                                out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
                            }
                            out_temp += vreduce(out_temp_vec);
                            // Scalar tail for the leftover elements
                            for (; index_wc < index_wc_end; ++index_wc, ++in_ptr_mover)
                            {
                                const auto src_val = *(in_ptr_mover);
                                const auto w_val   = *(weights_ptr_row + index_wc);
                                out_temp += src_val * w_val;
                            }
                        }
                        *(reinterpret_cast<T *>(out_ptr)) = out_temp;
                    },
                    wei);
            },
            out);
    }
    else // nhwc non optimized
    {
        // General path: internal x padding may be present, so W cannot be fused
        // with C; vectorization happens along the channel dimension instead.
        execute_window_loop(
            window_out,
            [&](const Coordinates &id)
            {
                // We are computing the theoretical input starting points
                const int in_w_start_t = static_cast<int>(id.y()) * conv_stride_w - conv_pad_left;
                const int in_h_start_t = static_cast<int>(id.z()) * conv_stride_h - conv_pad_top;
                const int in_w_end_t   = in_w_start_t + kernel_dim_w;
                const int in_h_end_t   = in_h_start_t + kernel_dim_h;

                // We are computing the valid initial and ending input points by checking the borders
                const int in_w_start = std::max(in_w_start_t, 0);
                const int in_h_start = std::max(in_h_start_t, 0);
                const int in_w_end   = std::min(in_w_end_t, input_dim_w);
                const int in_h_end   = std::min(in_h_end_t, input_dim_h);

                // We use the input points to select the valid weight points to use
                const int wei_w_start = in_w_start - in_w_start_t;
                const int wei_h_start = in_h_start - in_h_start_t;
                const int wei_w_end   = kernel_dim_w - (in_w_end_t - in_w_end);
                const int wei_h_end   = kernel_dim_h - (in_h_end_t - in_h_end);

                const int      index_c_end = weights->info()->dimension(0);
                const T *const in_ptr_start =
                    reinterpret_cast<const T *>(src->buffer() + src->info()->offset_first_element_in_bytes()) +
                    id[3] * input_stride_n;

                execute_window_loop(
                    window_w,
                    [&](const Coordinates &id_w)
                    {
                        const T *const weights_ptr_start = reinterpret_cast<const T *>(wei.ptr());
                        uint8_t       *out_ptr           = out.ptr() + id_w[3] * output_stride_c;

                        T out_temp = static_cast<T>(0);
                        // Walk the valid (unclipped) kernel rows and columns
                        for (int index_wei_h = wei_h_start, index_in_h = in_h_start; index_wei_h < wei_h_end;
                             ++index_wei_h, ++index_in_h)
                        {
                            const T *const in_ptr_row      = in_ptr_start + index_in_h * input_stride_h;
                            const T *const weights_ptr_row = weights_ptr_start + index_wei_h * kernel_stride_h;
                            for (int index_wei_w = wei_w_start, index_in_w = in_w_start; index_wei_w < wei_w_end;
                                 ++index_wei_w, ++index_in_w)
                            {
                                const T    *in_ptr_mover      = in_ptr_row + index_in_w * input_stride_w;
                                const T    *weights_ptr_mover = weights_ptr_row + index_wei_w * kernel_stride_w;
                                int         index_c           = 0;
                                vector_type out_temp_vec      = wrapper::vdup_n(static_cast<T>(0), tag_type());
                                // Vectorized multiply-accumulate along the channel dimension
                                for (; index_c <= index_c_end - num_elems_read_per_iteration;
                                     index_c += num_elems_read_per_iteration,
                                     in_ptr_mover += num_elems_read_per_iteration,
                                     weights_ptr_mover += num_elems_read_per_iteration)
                                {
                                    const auto src_vec = wrapper::vloadq(in_ptr_mover);
                                    const auto w_vec   = wrapper::vloadq(weights_ptr_mover);
                                    out_temp_vec       = wrapper::vmla(out_temp_vec, w_vec, src_vec);
                                }
                                out_temp += vreduce(out_temp_vec);
                                // Scalar tail for the remaining channels
                                for (; index_c < index_c_end; ++index_c, ++in_ptr_mover, ++weights_ptr_mover)
                                {
                                    const auto src_val = *(in_ptr_mover);
                                    const auto w_val   = *(weights_ptr_mover);
                                    out_temp += src_val * w_val;
                                }
                            }
                        }
                        *(reinterpret_cast<T *>(out_ptr)) = out_temp;
                    },
                    wei);
            },
            out);
    }
}

// Explicit instantiation used by the FP32 entry point (fp32.cpp).
template void convolve_nhwc<float>(
    const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info);

} // namespace kernels
} // namespace cpu
} // namespace arm_compute
Copyright (c) 2022-2023 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H +#define SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H + +#include "arm_compute/core/ITensor.h" + +#include "src/core/helpers/WindowHelpers.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +template <typename T> +void convolve_nhwc( + const Window &window, const ITensor *src, const ITensor *weights, ITensor *dst, const PadStrideInfo &conv_info); +} // namespace kernels +} // namespace cpu +} // namespace arm_compute + +#endif //SRC_CORE_NEON_KERNELS_CONV2D_IMPL_H diff --git a/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp new file mode 100644 index 0000000000..4c6fbec63a --- /dev/null +++ b/src/cpu/kernels/directconv2d/nhwc/neon/qasymm8.cpp @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2022-2023 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ + +#include "src/cpu/kernels/directconv2d/impl.h" +#include "src/cpu/kernels/directconv2d/nhwc/neon/impl.h" + +namespace arm_compute +{ +namespace cpu +{ +namespace kernels +{ +void run_im2col_qasymm8_pad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, true, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} + +void run_im2col_qasymm8_nopad(const ITensor *src, + ITensor *dst, + const Window &window, + DataLayout data_layout, + const PadStrideInfo &conv_info, + std::pair<unsigned int, unsigned int> convolved_dims, + const Size2D &kernel_dims, + const Size2D &dilation, + uint32_t input_pad_right, + bool has_bias) +{ + arm_compute::cpu::kernels::run_im2col<qasymm8_t, false, false>( + src, dst, window, data_layout, conv_info, convolved_dims, kernel_dims, dilation, input_pad_right, has_bias); +} +} // namespace kernels +} // namespace cpu +} // namespace arm_compute |