diff options
author | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-06-01 11:47:14 +0100 |
---|---|---|
committer | Viet-Hoa Do <viet-hoa.do@arm.com> | 2022-11-28 16:57:42 +0000 |
commit | 03b2971ac69a86f10a1566938d1a25afee15746c (patch) | |
tree | aec7cfc047e1da278b4b71a706cda7b1b0faa158 /src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp | |
parent | 7dc0234331f2150a6b4ac5c2b49de419870f7cf5 (diff) | |
download | ComputeLibrary-03b2971ac69a86f10a1566938d1a25afee15746c.tar.gz |
Integrate SME2 kernels
* Add SME/SME2 detection.
* Integrate SME2 implementation for:
- Normal convolution
- Winograd
- Depthwise convolution
- Pooling
Resolves: COMPMID-5700
Signed-off-by: Viet-Hoa Do <viet-hoa.do@arm.com>
Change-Id: I2f1ca1d05f8cfeee9309ed1c0a36096a4a6aad5c
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8692
Reviewed-by: Gunes Bayir <gunes.bayir@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp')
-rw-r--r-- | src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp | 271 |
1 files changed, 271 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp new file mode 100644 index 0000000000..4f25da2877 --- /dev/null +++ b/src/core/NEON/kernels/arm_gemm/interleave_indirect_impl.hpp @@ -0,0 +1,271 @@ +/* + * Copyright (c) 2022 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#pragma once + +// Implementations of interleave functions +// These must be included with a "namespace arm_gemm" block. + +/* + * Core function that does heavy lifting - interleave 'int_by' rows of width 'width' together. + * + * 'height' indicates the actual number of rows to interleave, so if it's less than int_by then the remaining + * entries are padded (note that this is "GEMM" padding rather than convolution padding, so there is no need to pad + * with a particular value. 
 *
 * Note that it is not expected for this templated version to ever be used - all cases that matter should be
 * explicitly specialized with an optimized implementation.
 *
 * Parameters (as established by the code below):
 *  - 'out'        : output pointer, passed by reference and advanced past everything written here.
 *  - 'in'         : array of per-row input pointers; only the first 'height' entries are dereferenced.
 *  - 'width'      : number of input elements per row to consume; rounded up to a multiple of 'block' on output.
 *  - 'row_offset' : element offset added to every row pointer before reading.
 *  - 'first'      : true on the first pass for this output block; affects running-sum restoration only.
 */
template<unsigned int height_vectors, unsigned int block, VLType vlt, bool integrate_sums, typename TIn, typename TOut>
void interleave_block( TOut * &out, const TIn * const *in, size_t width, size_t height, size_t row_offset, bool first) {
    // Number of rows interleaved together: for SVE/SME this scales with the runtime vector length,
    // otherwise it is just 'height_vectors'.
    const unsigned int int_by = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                    (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));

    // Running per-row sums; only allocated when sums are being integrated.
    std::vector<int32_t> the_sums;

    if (integrate_sums) {
        the_sums = std::vector<int32_t>(int_by, 0);

        if (!first) {
            // In 'integrate sums' mode, we dump the sums at the end on each pass.
            //
            // On the last pass this is correct, but on other passes it is not -
            // so on the subsequent pass we need to take the output written by
            // the previous pass as starting point for the sums, and then
            // overwrite them with new interleaved data.
            // NOTE: this reinterprets the TOut output stream as int32_t; the sum block written at the
            // end of the previous pass is int_by int32_t values immediately before 'out'.
            int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

            // Rewind pointer to where we wrote out the sums last time.
            out_int32 -= int_by;

            // Restore the running sums.
            memcpy(the_sums.data(), out_int32, int_by * sizeof(int32_t));

            // Update the "real" pointer so that the next output will clobber the old sums.
            out = reinterpret_cast<TOut *>(out_int32);
        }
    }

    // Walk the width in units of 'block', emitting 'block' entries for each of the int_by rows at
    // each step (so output is row-interleaved with block granularity).
    for (unsigned int pos=0; pos<width; pos+=block) {
        for (unsigned int row=0; row<int_by; row++) {
            // Row out of range - pad 'block' entries.
            if (row >= height) {
                for (unsigned int col=0; col<block; col++) {
                    *out++ = 0;
                }
                continue;
            }

            for (unsigned int col=0; col<block; col++) {
                // Column out of range - pad a single entry
                if (pos + col >= width) {
                    *out++ = 0;
                    continue;
                }

                if (integrate_sums) {
                    the_sums[row] += in[row][row_offset + pos + col];
                }

                *out++ = in[row][row_offset + pos + col];
            }
        }
    }

    // Append the (possibly partial) sums after the interleaved data; a later pass may rewind and
    // continue them (see above), and FixupRowSums() may scale them afterwards.
    if (integrate_sums) {
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        memcpy(out_int32, the_sums.data(), int_by * sizeof(int32_t));

        out = reinterpret_cast<TOut *>(out_int32 + int_by);
    }
}

/*
 * Post-process the row-sum block that follows an interleaved panel.
 *
 * 'out' points just past the panel: past the sums if interleave_block<..., true> wrote them
 * (row_sum_multiplier != 0), or at the reserved sum block otherwise.  In the first case the sums are
 * scaled in place by the multiplier and 'out' is unchanged; in the second case zero sums are written
 * and 'out' is advanced past them.
 */
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TOut>
inline void FixupRowSums(TOut * &out, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                    (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));

    // If we are integrating row sums, we need to do some fix up, depending on whether the multiplier is non-zero or not.
    if (row_sum_multiplier) {
        // Non-zero: interleave_block<>() will have done the sums, so 'out' will point to the start of the
        // next block (post sums).
        // We need to go back and apply the multiplier to the computed sums. We don't need to change 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        out_int32 -= height;
        for (unsigned int i=0; i<height; i++) {
            out_int32[i] *= row_sum_multiplier;
        }
    } else {
        // Zero: interleave_block<>() will *not* have done the sums, so 'out' will point to the start of the
        // sum block. We need to insert the (zero) sums, and advance 'out'.
        int32_t *out_int32 = reinterpret_cast<int32_t *>(out);

        for (unsigned int i=0; i<height; i++) {
            out_int32[i] = 0;
        }

        out_int32 += height;

        out = reinterpret_cast<TOut *>(out_int32);
    }
}

/*
 * Interleave input gathered through a two-level pointer table ('ptr[string][row]'), as used for
 * indirect (e.g. im2row-free) convolution.  The K range [k0, kmax) is split across 'strings' of
 * length 'stringlen' (padded to 'rounded_stringlen'); rows [y0, ymax) are processed in panels of
 * 'height' rows, calling interleave_block() once per string segment.
 */
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void IndirectInterleave(TOut *out, const TIn * const * const *ptr, unsigned int stringlen,
                        unsigned int rounded_stringlen, const unsigned int y0, const unsigned int ymax,
                        const unsigned int k0, const unsigned int kmax, bool integrate_sums,
                        const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                    (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));

    // 'interleave_block' implementations are entitled to read a pointer for each row they handle from the input
    // pointer array, even for out of range rows (although they must not subsequently dereference those pointers for
    // out of range rows). This allows interleave_block to use techniques like row predication, or loading all
    // pointers and conditionally overriding the out of range ones.

    // This is problematic in the "pure" indirect case when we get to the last rows, where it can lead to out of
    // range reads. Avoid this with a local buffer to use in last-rows cases. Use alloca as a std::vector can be
    // expensive in highly threaded scenarios.
    // NOTE: entries [active_height, height) of this buffer are deliberately left uninitialized; they may be
    // loaded but are never dereferenced (see the comment above).
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    // Figure out the starting position based on k0 (with rounded length)
    unsigned int start_string = k0 / rounded_stringlen;
    unsigned int start_stringpos = k0 % rounded_stringlen;

    // Process blocks of 'height' height...
    for (unsigned int ybase = y0; ybase < ymax; ybase+=height) {
        // Height to process
        unsigned int active_height = std::min(ymax - ybase, height);

        // Track our progress through the various strings
        unsigned int k_left = (kmax - k0);
        unsigned int string = start_string;
        unsigned int stringpos = start_stringpos;

        bool first = true;

        // Prepare to call 'interleave_block' above for each string encompassed by K range
        while (k_left > 0) {
            // Width to process - and the width we will generate (with padding)
            unsigned int in_width = std::min(k_left, stringlen - stringpos);
            unsigned int out_width = std::min(k_left, rounded_stringlen - stringpos);

            const TIn * const *row_base = ptr[string] + ybase;

            // If not all rows are valid, copy the ones that are into local array (see above comment).
            if (active_height < height) {
                for (unsigned int i=0; i<active_height; i++) {
                    row_ptrs[i] = ptr[string][ybase + i];
                }

                row_base = row_ptrs;
            }

            // 'integrate_sums' is a function parameter rather than a template parameter to prevent duplicating too
            // much code. However, integrated sums make no sense for non-integral types and won't ever be
            // requested. So put a type trait check here to avoid generating pointless code.
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_base, in_width, active_height, stringpos, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_base, in_width, active_height, stringpos, first);
            }

            // Advance by the padded width so K accounting matches the rounded string length.
            k_left -= out_width;
            string++;
            stringpos=0;
            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

/*
 * Interleave input supplied via a 'convolver' helper, which generates the row pointer sets for each
 * block of the convolution on the fly (conv_rows.next_block() fills 'row_ptrs' and returns the
 * width/offset to interleave).
 */
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void ConvolutionInterleave(TOut *out, const TIn *in, size_t in_stride, const convolver<TIn> &conv, const unsigned int rounded_stringlen,
        const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                    (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
    auto conv_cols = conv.process_columns(in, in_stride, k0, kmax, rounded_stringlen);

    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    for (unsigned int ybase = y0; ybase < ymax; ybase += height) {
        // How many of the rows are active - the rest will get padded in interleave_block.
        unsigned int active_height = std::min(ymax - ybase, height);
        bool first = true;

        auto conv_rows = conv_cols.process_rows(ybase, active_height);

        while (!conv_rows.finished()) {
            unsigned int width, offset;

            // Get next set of parameters
            std::tie(width, offset) = conv_rows.next_block(row_ptrs);

            // Perform the interleave
            // (sums only when TOut is integral - see the type-trait comment in IndirectInterleave).
            if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
                interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, active_height, offset, first);
            } else {
                interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, active_height, offset, first);
            }

            first=false;
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}

/*
 * Interleave a plain dense 2D matrix: row r of panel y starts at in + (y + r) * in_stride.
 * Each panel covers the full K range [k0, kmax), so interleave_block() is called exactly once per
 * panel (hence 'first' is always true here).
 */
template<unsigned int height_vectors, unsigned int block, VLType vlt, typename TIn, typename TOut>
void Interleave(TOut *out, const TIn *in, size_t in_stride, const unsigned int y0, const unsigned int ymax, const unsigned int k0, const unsigned int kmax, bool integrate_sums, const int32_t row_sum_multiplier) {
    const unsigned int height = height_vectors * (vlt == VLType::SVE ? get_vector_length<TOut>() / block :
                                    (vlt == VLType::SME ? sme::get_vector_length<TOut>() / block : 1 ));
    // Use alloca here as a std::vector can be expensive in highly threaded scenarios.
    const TIn **row_ptrs = reinterpret_cast<const TIn **>(alloca(height * sizeof(const TIn *)));

    const unsigned int width=kmax-k0;

    for (unsigned int y=y0; y<ymax; y+=height) {
        // All 'height' pointers are computed, even for rows past ymax; interleave_block() only
        // dereferences the first std::min(height, ymax-y) of them.
        for (unsigned int r=0; r<height; r++) {
            row_ptrs[r] = in + ((y + r) * in_stride);
        }

        if (std::is_integral<TOut>::value && integrate_sums && row_sum_multiplier) {
            interleave_block<height_vectors, block, vlt, true>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        } else {
            interleave_block<height_vectors, block, vlt, false>(out, row_ptrs, width, std::min(height, ymax-y), k0, true);
        }

        if (std::is_integral<TOut>::value && integrate_sums) {
            FixupRowSums<height_vectors, block, vlt>(out, row_sum_multiplier);
        }
    }
}