From c827e99fc46521f43719b0c2d1b6f05d66abf68c Mon Sep 17 00:00:00 2001
From: ramelg01
Date: Fri, 8 Apr 2022 03:52:28 +0100
Subject: =?UTF-8?q?Update=20Neon=E2=84=A2=20pooling=20kernel?=
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

- Reduce duplication and simplify overall structure.
- Improve multi-threaded performance by sharing more data in lower-level caches.

Partially Resolves: COMPMID-5054

Signed-off-by: Ramy Elgammal
Change-Id: I5f4dc50913401d5c1cbfc10b866fae9490cbc4d7
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7404
Tested-by: Arm Jenkins
Reviewed-by: Andrew Mundy
Reviewed-by: Sheri Zhang
Comments-Addressed: Arm Jenkins
---
 src/core/NEON/kernels/arm_conv/addressing.hpp | 263 ++++++++++++++++++++++++++
 1 file changed, 263 insertions(+)
 create mode 100644 src/core/NEON/kernels/arm_conv/addressing.hpp

(limited to 'src/core/NEON/kernels/arm_conv/addressing.hpp')

diff --git a/src/core/NEON/kernels/arm_conv/addressing.hpp b/src/core/NEON/kernels/arm_conv/addressing.hpp
new file mode 100644
index 0000000000..35715a3764
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/addressing.hpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2022 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+/* arm_conv kernels share a lot of similarities in how they address input and
+ * output tensors. Consequently, this file contains common approaches to
+ * preparing these tensor descriptions. Generic (i.e., untyped) methods are
+ * contained within the `arm_conv::addressing` namespace, and typed wrappers
+ * are provided within an anonymous namespace within `arm_conv`. The various
+ * methods are described below.
+ */
+
+#include <cstddef>
+
+namespace arm_conv {
+namespace addressing {
+
+/* Pointer array
+ * -------------
+ *
+ * Constructs an array of pointers which point to an `array_rows` x `array_cols`
+ * chunk of a tensor. The array of pointers will be written into `dest`.
+ *
+ * `base_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there is one padded row and one padded column, then `base_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are in bytes, and describe the strides over rows and
+ * columns (respectively) of the NHWC-ordered tensor. `pad_buffer` should point
+ * at a suitably sized (and initialised) area of memory which can be addressed
+ * by elements of the array which represent padding.
+ *
+ * `pad_top` and `pad_left` describe the padding on the top and left of the
+ * array, respectively, and `valid_rows` and `valid_cols` describe the number
+ * of rows and columns between the element pointed to by `base_ptr` and the
+ * edge of the image (that is, `valid_rows` may be greater than `array_rows`,
+ * and likewise for the columns).
+ */
+void fill_pointer_array(
+  size_t element_size,
+  void **dest, unsigned int array_rows, unsigned int array_cols,
+  void *base_ptr, size_t ld_row, size_t ld_col,
+  void *pad_buffer,
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+);
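
// [Illustrative sketch, not part of this patch] A minimal example of how the
// routine declared above might be driven for a float tensor, building a 3 x 3
// pointer array with one padded row at the top and one padded column on the
// left. Every name below (example_fill_3x3_window, first_valid, pad_values,
// window) is hypothetical and stands in for values a real kernel would compute.
inline void example_fill_3x3_window(
  float *first_valid,                // first VALID element of the chunk
  size_t ld_row, size_t ld_col,      // strides over rows/columns, in bytes
  float *pad_values,                 // suitably sized, initialised padding
  unsigned int valid_rows, unsigned int valid_cols,
  float *window[3 * 3])              // receives the pointer array
{
  arm_conv::addressing::fill_pointer_array(
    sizeof(float), (void **) window, 3, 3,
    first_valid, ld_row, ld_col,
    pad_values,
    1, valid_rows,                   // pad_top, rows to the image edge
    1, valid_cols);                  // pad_left, columns to the image edge
  // window[0..2] (padded top row), window[3] and window[6] (padded left
  // column) now address pad_values; the remaining entries address elements of
  // the tensor, except where valid_rows/valid_cols run out.
}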
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * For each point in an `output_rows` x `output_cols` array, constructs a
+ * `kernel_rows` x `kernel_cols` array of pointers. The pointers are
+ * interleaved as follows:
+ *
+ *   for ki in kernel_rows:
+ *     for kj in kernel_cols:
+ *       for oi in output_rows:
+ *         for oj in output_cols:
+ *           get pointer for point (oi*stride_rows + ki, oj*stride_cols + kj)
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ *
+ * The name reflects that this is the form of addressing used by "generic"
+ * depthwise and pooling kernels.
+ */
+void fill_pointer_array_generic_kernel(
+  size_t element_size,
+  void **dest,
+  unsigned int output_rows, unsigned int output_cols,
+  unsigned int kernel_rows, unsigned int kernel_cols,
+  unsigned int stride_rows, unsigned int stride_cols,
+  void *base_ptr, size_t ld_row, size_t ld_col,
+  void *pad_buffer,
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+);
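
// [Illustrative sketch, not part of this patch] Given the documented loop
// order, the pointer for kernel point (ki, kj) and output point (oi, oj) is
// written at the flattened index returned below; the helper is hypothetical
// and simply restates the interleaving described in the comment above.
constexpr unsigned int interleaved_pointer_index(
  unsigned int ki, unsigned int kj, unsigned int oi, unsigned int oj,
  unsigned int kernel_cols, unsigned int output_rows, unsigned int output_cols)
{
  return ((ki * kernel_cols + kj) * output_rows + oi) * output_cols + oj;
}
// For a 3 x 3 kernel over a 2 x 2 output tile, dest[0..3] therefore hold the
// pointers for kernel point (0, 0) across the four output points, dest[4..7]
// those for kernel point (0, 1), and so on.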
+
+/* NCHW-patch addressed by row
+ * ---------------------------
+ *
+ * Construct an array of pointers, each of which points at a row of an
+ * NCHW-ordered patch of a tensor. Memory addressed by the pointers may be
+ * outside of the original tensor, and should therefore not be written to
+ * (modifications will be lost).
+ *
+ * `dest_row_pointers` should point at a list of `patch_rows` pointers, each of
+ * which will point at a 1 x `patch_cols` NCHW-ordered sample of the source
+ * tensor.
+ *
+ * `dest_patch` should point to an `element_size * patch_rows * patch_cols`
+ * area of memory which can be written to by this function to form samples of
+ * the source tensor.
+ *
+ * `src_ptr` should point at the first VALID element of the chunk of tensor
+ * (i.e., if there is one padded row and one padded column, then `src_ptr`
+ * should point at the element which will be at position (1, 1) in the array).
+ * `ld_row` and `ld_col` are in bytes, and describe the strides over rows and
+ * columns (respectively) of the NHWC-ordered tensor. If `ld_col` ==
+ * `element_size` then copies from the source tensor will be elided and source
+ * data may be addressed directly.
+ *
+ * `pad_row` should point to a `patch_cols` array of (appropriately
+ * initialised) padding values.
+ *
+ * Other arguments are as for `fill_pointer_array`.
+ */
+void fill_nchw_patch_array(
+  size_t element_size,
+  const void **dest_row_pointers,  // Array of pointers to each row of the patch
+  void *dest_patch,                // Pointer to space which can be used to construct the patch
+  unsigned int patch_rows, unsigned int patch_cols,    // Patch size
+  const void *src_ptr, size_t ld_row, size_t ld_col,   // Source tensor
+  const void *pad_row,             // Pointer to a row of padding values
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+);
+
+void fill_patch_array_generic_kernel(
+  size_t element_size,
+  const void **dest_pointers,  // Pointers: one per output row per kernel point
+  void *dest_patch,            // Pointer to space which can be used to construct the patch
+  unsigned int output_rows, unsigned int output_cols,
+  unsigned int kernel_rows, unsigned int kernel_cols,
+  unsigned int stride_rows, unsigned int stride_cols,
+  const void *src_ptr, size_t ld_row, size_t ld_col,   // Source tensor
+  const void *pad_row,         // Pointer to a row of padding values
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+);
+
+} // namespace addressing
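
// [Illustrative sketch, not part of this patch] A minimal example of how
// `addressing::fill_nchw_patch_array` might be used to gather a 3 x 3
// NCHW-ordered patch of a float tensor with one padded row and one padded
// column; every name below is hypothetical.
inline void example_gather_3x3_nchw_patch(
  const float *first_valid,           // first VALID element of the chunk
  size_t ld_row, size_t ld_col,       // strides over rows/columns, in bytes
  const float *pad_row_values,        // at least 3 initialised padding values
  unsigned int valid_rows, unsigned int valid_cols,
  const float *row_pointers[3],       // receives one pointer per patch row
  float patch_store[3 * 3])           // scratch space for gathered samples
{
  arm_conv::addressing::fill_nchw_patch_array(
    sizeof(float),
    reinterpret_cast<const void **>(row_pointers),
    patch_store, 3, 3,
    first_valid, ld_row, ld_col,
    pad_row_values,
    1, valid_rows,                    // pad_top, rows to the image edge
    1, valid_cols);                   // pad_left, columns to the image edge
  // Each row_pointers[i] now addresses three contiguous floats: a row gathered
  // into patch_store, the padding row, or (when ld_col == sizeof(float)) the
  // source tensor itself.
}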
+
+namespace {
+
+/* Pointer array
+ * -------------
+ *
+ * See `addressing::fill_pointer_array`. No copies are made by this method;
+ * memory pointed to by the pointer array is contained within the base tensor
+ * and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array(
+  T **dest, unsigned int array_rows, unsigned int array_cols,
+  T *base_ptr, size_t ld_row, size_t ld_col,
+  T *pad_buffer,
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+)
+{
+  addressing::fill_pointer_array(
+    sizeof(T), (void **) dest, array_rows, array_cols,
+    (void *) base_ptr, ld_row, ld_col,
+    (void *) pad_buffer,
+    pad_top, valid_rows,
+    pad_left, valid_cols
+  );
+}
+
+
+/* Interleaved multi-point pointer array
+ * -------------------------------------
+ *
+ * See `addressing::fill_pointer_array_generic_kernel`. No copies are made by
+ * this method; memory pointed to by the pointer array is contained within the
+ * base tensor and the padding buffer.
+ */
+template <typename T>
+inline void fill_pointer_array_generic_kernel(
+  T **dest,
+  unsigned int output_rows, unsigned int output_cols,
+  unsigned int kernel_rows, unsigned int kernel_cols,
+  unsigned int stride_rows, unsigned int stride_cols,
+  T *base_ptr, size_t ld_row, size_t ld_col,
+  T *pad_buffer,
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+)
+{
+  addressing::fill_pointer_array_generic_kernel(
+    sizeof(T),
+    (void **) dest,
+    output_rows, output_cols,
+    kernel_rows, kernel_cols,
+    stride_rows, stride_cols,
+    (void *) base_ptr, ld_row, ld_col,
+    (void *) pad_buffer,
+    pad_top, valid_rows,
+    pad_left, valid_cols
+  );
+}
+
+template <typename T>
+inline void fill_nchw_patch_array(
+  const T **dest_row_pointers,  // Array of pointers to each row of the patch
+  T *dest_patch,                // Pointer to space which can be used to construct the patch
+  unsigned int patch_rows, unsigned int patch_cols,  // Patch size
+  const T *src_ptr, size_t ld_row, size_t ld_col,    // Source tensor
+  const T *pad_row,             // Pointer to a row of padding values
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+)
+{
+  addressing::fill_nchw_patch_array(
+    sizeof(T),
+    reinterpret_cast<const void **>(dest_row_pointers),
+    reinterpret_cast<void *>(dest_patch),
+    patch_rows, patch_cols,
+    reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+    reinterpret_cast<const void *>(pad_row),
+    pad_top, valid_rows,
+    pad_left, valid_cols
+  );
+}
+
+template <typename T>
+inline void fill_patch_array_generic_kernel(
+  const T **dest_pointers,  // Pointers: one per output row per kernel point
+  T *dest_patch,            // Pointer to space which can be used to construct the patch
+  unsigned int output_rows, unsigned int output_cols,
+  unsigned int kernel_rows, unsigned int kernel_cols,
+  unsigned int stride_rows, unsigned int stride_cols,
+  const T *src_ptr, size_t ld_row, size_t ld_col,     // Source tensor
+  const T *pad_row,         // Pointer to a row of padding values
+  unsigned int pad_top, unsigned int valid_rows,
+  unsigned int pad_left, unsigned int valid_cols
+)
+{
+  addressing::fill_patch_array_generic_kernel(
+    sizeof(T),
+    reinterpret_cast<const void **>(dest_pointers),
+    reinterpret_cast<void *>(dest_patch),
+    output_rows, output_cols,
+    kernel_rows, kernel_cols,
+    stride_rows, stride_cols,
+    reinterpret_cast<const void *>(src_ptr), ld_row, ld_col,
+    reinterpret_cast<const void *>(pad_row),
+    pad_top, valid_rows,
+    pad_left, valid_cols
+  );
+}
+
+} // namespace {anonymous}
+} // namespace arm_conv
--
cgit v1.2.1
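
As a closing illustration of how the pointer arrays produced by these helpers
are typically consumed, the sketch below computes one output point of a plain
(non-vectorised) 3 x 3 float max-pool from an array filled by the typed
`fill_pointer_array` wrapper. It is an editorial example under assumed names
(`window`, `channels`, `output`), not code from Compute Library, and it assumes
the padding buffer was initialised with a suitably small value so that padded
elements never win the max.

#include <algorithm>

// Hypothetical consumer of a 3x3 pointer array: each window[i] addresses
// `channels` contiguous floats (NHWC) or the padding buffer, so padding needs
// no special handling here beyond its initial value.
inline void max_pool_3x3_one_point(
  float *const window[3 * 3],  // filled by the typed fill_pointer_array wrapper
  unsigned int channels,
  float *output)               // receives `channels` pooled values
{
  for (unsigned int c = 0; c < channels; c++)
  {
    float acc = window[0][c];
    for (unsigned int i = 1; i < 3 * 3; i++)
    {
      acc = std::max(acc, window[i][c]);
    }
    output[c] = acc;
  }
}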