From 8094f9dd5307c55f545b2cb41ec80a739a9b4d6f Mon Sep 17 00:00:00 2001 From: Pablo Marquez Tello Date: Thu, 12 Jan 2023 16:44:34 +0000 Subject: Remove unused code in arm_conv/depthwise/ * Removed header files in arm_conv/depthwise * Resolves MLCE-990 Change-Id: Iacddd80e2d83ff0fbafb817014f90c5bc80dab3c Signed-off-by: Pablo Marquez Tello Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/8946 Reviewed-by: Andrew Mundy Reviewed-by: Viet-Hoa Do Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins Tested-by: Arm Jenkins --- .../depthwise_depthfirst_generic_multiplier.hpp | 473 --------------------- ...ise_depthfirst_generic_multiplier_quantized.hpp | 127 ------ .../depthwise_depthfirst_generic_quantized.hpp | 125 ------ .../depthwise_depthfirst_multiplier_quantized.hpp | 251 ----------- .../depthwise/depthwise_depthfirst_quantized.hpp | 412 ------------------ 5 files changed, 1388 deletions(-) delete mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp delete mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp delete mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp delete mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp delete mode 100644 src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp deleted file mode 100644 index bb580e605a..0000000000 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier.hpp +++ /dev/null @@ -1,473 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" - -#ifdef CYCLE_PROFILING -#include "profiler.hpp" -#endif - -#include - -namespace arm_conv { -namespace depthwise { - -template -class DepthwiseDepthfirstGenericWithMultiplierBase : - public DepthwiseCommon -{ - protected: - - using TInput = typename strategy::input_type; - using TWeight = typename strategy::weight_type; - using TOutput = typename strategy::return_type; - using TAccum = typename strategy::bias_type; - - unsigned int kernel_points(void) const - { - return this->m_args.kernel_rows * this->m_args.kernel_cols; - } - - unsigned int input_rows(void) const - { - return (strategy::output_rows() - 1) * this->m_args.stride_rows + this->m_args.kernel_rows; - } - - unsigned int input_cols(void) const - { - return (strategy::output_cols() - 1) * this->m_args.stride_cols + this->m_args.kernel_cols; - } - - size_t sizeof_inptr_array(void) const - { - return sizeof(TInput *) * kernel_points() * strategy::output_rows(); - } - - size_t sizeof_input_samples(void) const - { - // We have a sample for each kernel point, for each point of the output array. - return sizeof(TInput) * kernel_points() * - strategy::output_rows() * - strategy::output_col_regs() * - (16 / sizeof(TAccum)); - } - - size_t sizeof_outptr_array(void) const - { - return sizeof(TOutput *) * strategy::output_rows() * strategy::output_cols(); - } - - size_t sizeof_output_buffer(unsigned int n_channels) const - { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(TOutput) * rounded_channels; - } - - void pack_weights(TWeight *buffer, const TWeight *weights, size_t ld_weight_col, size_t ld_weight_row) const - { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - ld_weight_col = (ld_weight_col == 0) ? this->m_args.channel_multiplier * this->m_args.input_channels : ld_weight_col; - ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row; - - for (unsigned int in_c = 0; in_c < this->m_args.input_channels; in_c++) - { - for (unsigned int n = 0; n < this->m_args.channel_multiplier; n += vl) - { - const unsigned int out_c = in_c * this->m_args.channel_multiplier + n; - const unsigned int todo = std::min(vl, this->m_args.channel_multiplier - n); - - // Copy each of the weights in turn - auto weights_row = weights + out_c; - for (unsigned int i = 0; i < this->m_args.kernel_rows; i++) - { - auto weights_col = weights_row; - - for (unsigned int j = 0; j < this->m_args.kernel_cols; j++) - { - for (unsigned int m = 0; m < todo; m++) - { - buffer[m] = weights_col[m]; - } - buffer += vl; - - weights_col += ld_weight_col; - } - - weights_row += ld_weight_row; - } - } - } - } - - void execute_tiles( - std::function tile_fn, - const TInput pad_value, - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const - { -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - - // Determine what portion of the work to do. - const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); - const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); - const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); - - // Need a stride over blocks of parameters - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const unsigned int param_stride = arm_gemm::roundup(this->m_args.channel_multiplier, vl) * kernel_points(); - - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast(_input); - TOutput *const outptr = static_cast(_output); - - // Allocate portions of the working space - uint8_t *working_space = static_cast(_working_space) + - get_working_size(thread_id, input_channels); - - const TInput **inptrs = reinterpret_cast(working_space); - working_space += sizeof_inptr_array(); - - // To simplify the kernel, we process padded or non-NCHW-ordered input into - // a form which can be consumed by the kernel. This data is stored here and - // passed into the kernel as an array of N pointers (one per row of the - // input). - TInput *rearranged_input = reinterpret_cast(working_space); - working_space += sizeof_input_samples(); - - TOutput **outptr_array = reinterpret_cast(working_space); - working_space += sizeof_outptr_array(); - - TOutput *const output_buffer = reinterpret_cast(working_space); - - // TODO Dynamically change the input pointer array in cases where we could - // read directly from the input tensor; for now though assume we will - // always read from the sample array. - { - auto my_inptrs = inptrs; - auto my_input_samples = rearranged_input; - - // For each kernel point; for each row of output; for each register of - // values containing a QUAD of source values. - const unsigned int quad_length = 16 / sizeof(TAccum); - - for (auto p = 0u; p < kernel_points() * strategy::output_rows(); p++) - { - *(my_inptrs)++ = my_input_samples; - my_input_samples += arm_gemm::roundup(strategy::output_cols(), quad_length); - } - } - - // For each output tile, construct the requisite set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) - { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; - - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast(strategy::output_rows())) - { - const int end_out_i = std::min(start_out_i + static_cast(strategy::output_rows()), end_out_height); - const int start_in_i = start_out_i * this->m_args.stride_rows - padding.top; - const int end_in_i = start_in_i + input_rows(); - - // Compute top/bottom padding - const auto pad_top = static_cast(-std::min(start_in_i, 0)); - const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); - const unsigned int valid_output_rows = std::min( - end_out_i - start_out_i, - static_cast(output_height) - start_out_i - ); - - const int pad_rows = pad_top + pad_bottom; - - for (int start_out_j = 0; start_out_j < static_cast(output_width);) - { - const int start_in_j = start_out_j * this->m_args.stride_cols - this->m_args.padding.left; - const int pad_left = -std::min(0, start_in_j); - - const int end_out_j = start_out_j + strategy::output_cols(); - const int end_in_j = start_in_j + input_cols(); - - const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); - const unsigned int valid_output_cols = std::min( - end_out_j - start_out_j, - static_cast(output_width) - start_out_j - ); - - const int pad_cols = pad_left + pad_right; - - // Construct the output pointer array. - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < valid_output_rows; i++) - { - unsigned int j = 0u; - TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; - for (; j < valid_output_cols; j++) - { - *(outptr_pos++) = colptr; - colptr += ld_output_col; - } - for (; j < strategy::output_cols(); j++) - { - *(outptr_pos++) = output_buffer; - } - } - for (auto i = valid_output_rows; i < strategy::output_rows(); i++) - { - for (auto j = 0u; j < strategy::output_cols(); j++) - { - *(outptr_pos++) = output_buffer; - } - } - - start_out_j += strategy::output_cols(); - - const TWeight *params = static_cast(parameters); - - // Fill the input samples with padding. We can do this outside of - // the channel loop, as the position of padding isn't going to - // change as a function of channel. - for (auto i = 0u; i < kernel_points() * strategy::output_rows() * strategy::output_cols(); i++) - { - rearranged_input[i] = pad_value; - } - - // Loop over the input channels - for (unsigned int in_c = 0; in_c < input_channels; in_c++) - { - auto inptr_row = inptr_batch + in_c + - (start_in_i + pad_top) * ld_input_row + - (start_in_j + pad_left) * ld_input_col; - - // Construct the array of input samples; for each point of the - // kernel we provide an input value for each output point. - auto input_samples = rearranged_input; - for (auto ki = 0u; ki < this->m_args.kernel_rows; ki++) - { - for (auto kj = 0u; kj < this->m_args.kernel_cols; kj++) - { - // Copy the pointer for the input samples associated with this - // kernel point. Then update the main pointer to account for - // this point. - auto point_input_samples = input_samples; - input_samples += strategy::output_rows() * strategy::output_cols(); - - int ii = static_cast(ki) - static_cast(pad_top); - for (auto oi = 0u; - oi < strategy::output_rows() && - ii < static_cast(input_rows()) - pad_rows; - oi++, ii += this->m_args.stride_rows) - { - if (0 <= ii) // Fill in values only if this row is in range. - { - int ij = static_cast(kj) - static_cast(pad_left); - for (auto oj = 0u; - oj < strategy::output_cols() && - ij < static_cast(input_cols()) - pad_cols; - oj++, ij += this->m_args.stride_cols) - { - if (0 <= ij) // Sample if the point is in range. - { - point_input_samples[oj] = *(inptr_row + ii*ld_input_row + ij*ld_input_col); - } - } - } - - point_input_samples += strategy::output_cols(); - } - } - } - - tile_fn(inptrs, outptr_array, params, in_c, in_c*this->m_args.channel_multiplier); - - // Progress the output pointers - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < strategy::output_rows() * strategy::output_cols(); i++) - { - outptr_pos[i] += this->m_args.channel_multiplier; - } - - // Progress the pointer into the parameters - params += param_stride; - } - } - } - } - } - - public: - DepthwiseDepthfirstGenericWithMultiplierBase(const DepthwiseArgs &args) : DepthwiseCommon(args) - { - } - - DepthwiseDepthfirstGenericWithMultiplierBase(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete; - DepthwiseDepthfirstGenericWithMultiplierBase &operator=(DepthwiseDepthfirstGenericWithMultiplierBase &) = delete; - - size_t get_storage_size(void) const override - { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = this->m_args.input_channels * arm_gemm::roundup(this->m_args.channel_multiplier, vl); - return kernel_points() * rounded_channels * sizeof(TWeight); - } - - size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override - { - const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; - return n_threads * (sizeof_inptr_array() + - sizeof_input_samples() + - sizeof_outptr_array() + - sizeof_output_buffer(n_output_channels)); - } -}; - -template -class DepthwiseDepthfirstGenericWithMultiplier : public DepthwiseDepthfirstGenericWithMultiplierBase -{ - using TInput = typename strategy::input_type; - using TWeight = typename strategy::weight_type; - using TOutput = typename strategy::return_type; - using TAccum = typename strategy::bias_type; - - using Parent = DepthwiseDepthfirstGenericWithMultiplierBase; - - const TAccum *m_biases; // Pointer to bias vector - - public: - DepthwiseDepthfirstGenericWithMultiplier(const DepthwiseArgs &args) - : Parent(args), m_biases(nullptr) - { - } - - DepthwiseDepthfirstGenericWithMultiplier(DepthwiseDepthfirstGenericWithMultiplier &) = delete; - DepthwiseDepthfirstGenericWithMultiplier &operator=(DepthwiseDepthfirstGenericWithMultiplier &) = delete; - - void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override - { - m_biases = static_cast(biases); - Parent::pack_weights(static_cast(buffer), static_cast(weights), ld_weight_col, ld_weight_row); - } - - using DepthwiseDepthfirstGenericWithMultiplierBase::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override - { - strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - - // Compute activation values - TAccum activation_min, activation_max; - std::tie(activation_min, activation_max) = get_default_activation_values(); - - switch (this->m_args.activation.type) - { - case arm_gemm::Activation::Type::BoundedReLU: - activation_max = static_cast(this->m_args.activation.param1); - // Fall through - case arm_gemm::Activation::Type::ReLU: - activation_min = static_cast(0); - break; - default: - break; - } - - // Get a function to call for each point of the output - auto tile_fn = [&] (const TInput **inptrs, - TOutput **outptrs, - const TWeight *weights, - const unsigned int, - const unsigned int start_output_channel) { -#ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols)); -#endif - strat.kernel( - inptrs, outptrs, weights, - m_biases ? m_biases + start_output_channel : nullptr, - this->kernel_points(), this->m_args.channel_multiplier, - activation_min, activation_max - ); - }; - - Parent::execute_tiles( - tile_fn, 0.0f, - batches, input_height, input_width, input_channels, padding, - _input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - output_height, output_width, - _output, ld_output_col, ld_output_row, ld_output_batch, - _working_space, thread_id, n_threads - ); - } -}; - -} // namespace depthwise -} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp deleted file mode 100644 index d42382e208..0000000000 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_multiplier_quantized.hpp +++ /dev/null @@ -1,127 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" - -#ifdef CYCLE_PROFILING -#include "profiler.hpp" -#endif - -#include "depthwise_depthfirst_generic_multiplier.hpp" - -namespace arm_conv { -namespace depthwise { - -template -class DepthwiseDepthfirstGenericWithMultiplierQuantized : public DepthwiseDepthfirstGenericWithMultiplierBase -{ - using TInput = typename strategy::input_type; - using TWeight = typename strategy::weight_type; - using TOutput = typename strategy::return_type; - using TAccum = typename strategy::bias_type; - - using Parent = DepthwiseDepthfirstGenericWithMultiplierBase; - - arm_gemm::Requantize32 m_qp; - - public: - DepthwiseDepthfirstGenericWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp) - : Parent(args), m_qp(qp) - { - } - - DepthwiseDepthfirstGenericWithMultiplierQuantized(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete; - DepthwiseDepthfirstGenericWithMultiplierQuantized &operator=(DepthwiseDepthfirstGenericWithMultiplierQuantized &) = delete; - - void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override - { - m_qp.bias = static_cast(biases); - Parent::pack_weights(static_cast(buffer), static_cast(weights), ld_weight_col, ld_weight_row); - } - - using Parent::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override - { - strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - - // Get a function to call for each point of the output - auto tile_fn = [&] (const TInput **inptrs, - TOutput **outptrs, - const TWeight *weights, - const unsigned int, - const unsigned int start_output_channel) { -#ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows() * strategy::output_cols() * this->m_args.channel_multiplier * this->m_args.kernel_rows * this->m_args.kernel_cols)); -#endif - strat.kernel( - inptrs, outptrs, weights, - m_qp.bias == nullptr ? nullptr : m_qp.bias + start_output_channel, - this->kernel_points(), - this->m_args.channel_multiplier, - m_qp.per_channel_left_shifts == nullptr ? nullptr : m_qp.per_channel_left_shifts + start_output_channel, - m_qp.per_channel_muls == nullptr ? nullptr : m_qp.per_channel_muls + start_output_channel, - m_qp.per_channel_right_shifts == nullptr ? nullptr : m_qp.per_channel_right_shifts + start_output_channel, - m_qp - ); - }; - - Parent::execute_tiles( - tile_fn, m_qp.a_offset, - batches, input_height, input_width, input_channels, padding, - _input, ld_input_col, ld_input_row, ld_input_batch, - parameters, - output_height, output_width, - _output, ld_output_col, ld_output_row, ld_output_batch, - _working_space, thread_id, n_threads - ); - } -}; - -} // namespace depthwise -} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp deleted file mode 100644 index cfb0d4bc05..0000000000 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic_quantized.hpp +++ /dev/null @@ -1,125 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "depthwise_depthfirst_generic.hpp" - -#include "arm_gemm.hpp" -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" - -#ifdef CYCLE_PROFILING -#include "profiler.hpp" -#endif - -using arm_gemm::Requantize32; - -namespace arm_conv { -namespace depthwise { - -template -class DepthwiseDepthfirstGenericQuantized : public DepthwiseDepthfirstGenericBase -{ - using Parent = DepthwiseDepthfirstGenericBase; - using TInput = typename Parent::TInput; - using TAccum = typename Parent::TAccum; - using TOutput = typename Parent::TOutput; - - Requantize32 m_qp; - - public: - DepthwiseDepthfirstGenericQuantized(const DepthwiseArgs &args, const Requantize32 &qp) - : Parent(args), m_qp(qp) - { - } - - DepthwiseDepthfirstGenericQuantized(DepthwiseDepthfirstGenericQuantized &) = delete; - DepthwiseDepthfirstGenericQuantized &operator=(DepthwiseDepthfirstGenericQuantized &) = delete; - - void pack_parameters(void *buffer, const void *biases, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override - { - m_qp.bias = static_cast(biases); - Parent::pack_parameters(buffer, biases, weights, ld_weight_col, ld_weight_row); - } - - using DepthwiseDepthfirstGenericBase::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override - { - Strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - - // Create a function to initialise the input buffer - const auto initialise_input_buffer = [this] (TInput *const buffer, const unsigned int n) { - std::memset(buffer, static_cast(m_qp.a_offset), n * sizeof(TInput)); - }; - - // Create a function to execute a tile of work - const auto tile_fn = [&] (const TInput *const *const inptrs, TOutput *const * const outptrs) { -#ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler( - PROFILE_KERNEL, - (unsigned long) (OutputRows * OutputCols * this->m_args.kernel_rows* this->m_args.kernel_cols) - ); -#endif - strat.kernel(inptrs, outptrs, parameters, m_qp, - this->m_args.kernel_rows * this->m_args.kernel_cols, - this->m_args.input_channels); - }; - - // Call into a parent utility function to do the actual work. - Parent::execute_tiles( - tile_fn, initialise_input_buffer, - batches, input_height, input_width, input_channels, padding, - _input, ld_input_col, ld_input_row, ld_input_batch, - output_height, output_width, - _output, ld_output_col, ld_output_row, ld_output_batch, - _working_space, thread_id, n_threads - ); - } -}; - -} // namespace depthwise -} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp deleted file mode 100644 index 07ce0d3b55..0000000000 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_multiplier_quantized.hpp +++ /dev/null @@ -1,251 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "depthwise_depthfirst_multiplier.hpp" - -namespace arm_conv { -namespace depthwise { - -template -class DepthwiseDepthfirstWithMultiplierQuantized : - public DepthwiseCommon -{ - using Parent = DepthwiseCommon; - using TInput = typename strategy::input_type; - using TWeight = typename strategy::weight_type; - using TOutput = typename strategy::return_type; - - const arm_gemm::Requantize32 m_qp; - - size_t sizeof_output_buffer(unsigned int n_channels) const - { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(typename strategy::return_type) * rounded_channels; - } - - public: - DepthwiseDepthfirstWithMultiplierQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp) - : Parent(args), m_qp(qp) - { - } - - DepthwiseDepthfirstWithMultiplierQuantized(DepthwiseDepthfirstWithMultiplierQuantized &) = delete; - DepthwiseDepthfirstWithMultiplierQuantized &operator=(DepthwiseDepthfirstWithMultiplierQuantized &) = delete; - - size_t get_storage_size(void) const override - { - // We produce VL channels at a time, for each of these blocks of - // channels we store a vector of biases, weights (complicated) and - // requantize parameters. - const unsigned int iter_length = - arm_gemm::utils::get_vector_length(strategy::vl_type); - const unsigned int n_iters = - this->m_args.input_channels * arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length); - - // Compute the cost of storing the weights - const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u); - - return n_iters * iter_length * ( - sizeof(int32_t) + // Bias - 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(TWeight) + // Weights - 2 * sizeof(int32_t) // Requantisation parameters - ); - } - - // We'll want an optimised version of this, but for now a C++ implementation - // is probably sufficient. - void pack_parameters(void *_buffer, const void *_biases, const void *_weights, size_t ld_weight_col, size_t ld_weight_row) override - { - auto buffer = static_cast(_buffer); - auto biases = static_cast(_biases); - auto weights = static_cast(_weights); - auto requant_muls = m_qp.per_channel_muls; - auto requant_shifts = m_qp.per_channel_right_shifts; - - const unsigned int iter_length = - arm_gemm::utils::get_vector_length(strategy::vl_type); - const unsigned int n_iters_per_input_channel = - arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length); - - const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u); - - const size_t iter_stride = iter_length * ( - sizeof(int32_t) + // Bias - 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights - 2 * sizeof(int32_t) // Requantisation parameters - ); - - ld_weight_col = (ld_weight_col == 0) ? this->m_args.input_channels * this->m_args.channel_multiplier : ld_weight_col; - ld_weight_row = (ld_weight_row == 0) ? this->m_args.kernel_cols * ld_weight_col : ld_weight_row; - - for (unsigned int input_channel = 0; input_channel < this->m_args.input_channels; input_channel++) - { - auto buffer_input_channel = buffer + input_channel * n_iters_per_input_channel * iter_stride; - auto weights_input_channel = weights + input_channel * this->m_args.channel_multiplier; - - for (unsigned int iter = 0; iter < n_iters_per_input_channel; iter++) - { - // Get a pointer to the start of this portion of the buffer; consequently - // derive pointers to the bias, weight and requantisation portions of - // this frame. - auto buffer_base = buffer_input_channel + iter_stride * iter; - auto buffer_biases = reinterpret_cast(buffer_base); - auto buffer_weights = buffer_base + sizeof(int32_t) * iter_length; - auto buffer_requant_mul = reinterpret_cast( - buffer_weights + strategy::kernel_rows * n_dots_per_kernel_row * 4 * iter_length); - auto buffer_requant_shift = buffer_requant_mul + iter_length; - auto weights_base = weights_input_channel + iter * iter_length; - - // Hence work through the data for this iteration, on a - // channel-by-channel basis. - const auto this_iter_length = std::min( - iter_length, this->m_args.channel_multiplier - iter * iter_length - ); - for (unsigned int i = 0; i < this_iter_length; i++) - { - auto weights_channel = weights_base + i; - - // Read the bias value, we modify this as we read the weights. - auto bias_value = biases == nullptr ? 0 : *(biases++); - int32_t elements_sum = 0; - - // Read through the kernel; for each row, marshal together as many dot - // product terms as are required. - for (unsigned int ki = 0; ki < strategy::kernel_rows; ki++) - { - auto buffer_row = buffer_weights + i*4 + ki * 4 * n_dots_per_kernel_row * iter_length; - auto weights_row = weights_channel + ki * ld_weight_row; - - unsigned int kj = 0; - for (; kj < strategy::kernel_cols; kj++) - { - // Determine which element to which we're writing - const auto dot = kj / 4; - const auto elem = kj % 4; - - // Copy the value; include in the sum - const auto val = weights_row[kj * ld_weight_col]; - buffer_row[dot * 4 * iter_length + elem] = val; - elements_sum += val; - } - for (; kj < 4 * n_dots_per_kernel_row; kj++) - { - const auto dot = kj / 4; - const auto elem = kj % 4; - buffer_row[dot * 4 * iter_length + elem] = 0; - } - - buffer_row += 4 * n_dots_per_kernel_row * iter_length; - } - - // Write back the bias and offset values - *(buffer_biases++) = - bias_value - m_qp.a_offset * elements_sum + - strategy::kernel_rows * strategy::kernel_cols * m_qp.a_offset * m_qp.b_offset; - - // Write out the requantisation parameters - *(buffer_requant_mul++) = m_qp.per_channel_requant ? *(requant_muls++) : m_qp.per_layer_mul; - *(buffer_requant_shift++) = m_qp.per_channel_requant ? *(requant_shifts++) : m_qp.per_layer_right_shift; - } - } - } - } - - size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override - { - const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; - return n_threads * sizeof_output_buffer(n_output_channels); - } - - using Parent::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *const _working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override - { - strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - - auto executefn = [strat, this] ( - const TInput *const *const inptrs, - TOutput *const *const outptr_array, - const void *const params - ) { - strat.kernel(inptrs, outptr_array, params, this->m_args.channel_multiplier, m_qp); - }; - - // Get working space for this thread - uint8_t *const working_space = static_cast(_working_space) + get_working_size(1, input_channels) * thread_id; - - // Determine the stride across blocks of parameters - const unsigned int iter_length = - arm_gemm::utils::get_vector_length(strategy::vl_type); - const unsigned int n_iters_per_input_channel = arm_gemm::iceildiv(this->m_args.channel_multiplier, iter_length); - const unsigned int n_dots_per_kernel_row = arm_gemm::iceildiv(strategy::kernel_cols, 4u); - const size_t param_stride = n_iters_per_input_channel * iter_length * ( - sizeof(int32_t) + // Bias - 4 * n_dots_per_kernel_row * strategy::kernel_rows * sizeof(int8_t) + // Weights - 2 * sizeof(int32_t) // Requantisation parameters - ); - - common::depthwise_multiplier_execute( - executefn, m_qp.a_offset, this->m_args, - batches, input_height, input_width, input_channels, padding, - _input, ld_input_col, ld_input_row, ld_input_batch, - parameters, param_stride, - output_height, output_width, - _output, ld_output_col, ld_output_row, ld_output_batch, - working_space, thread_id, n_threads - ); - } -}; - -} // namespace depthwise -} // namespace arm_conv diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp deleted file mode 100644 index f97569e958..0000000000 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_quantized.hpp +++ /dev/null @@ -1,412 +0,0 @@ -/* - * Copyright (c) 2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ - -#pragma once - -#include "src/core/NEON/kernels/arm_gemm/utils.hpp" - -#ifdef CYCLE_PROFILING -#include "profiler.hpp" -#endif - -namespace arm_conv { -namespace depthwise { - -namespace -{ - -// We have two sets of quantized kernels; those which use the dot-product -// instructions and which require the biases and quantisation parameters to be -// ravelled into weights/parameter array, and those which use the MLAL -// instructions and which consume separate bias and quantisation parameter -// arrays. The following code adapts these two sets of kernels to use the same -// API - allowing the same driver loop to call them both. - -template -using UnravelledKernFn = std::function; - -template -using RavelledKernFn = std::function; - -template -const UnravelledKernFn get_unified_kernel(const UnravelledKernFn &f) { return f; } - -template -const UnravelledKernFn get_unified_kernel(const RavelledKernFn &f) -{ - return [f] (const unsigned int n_channels, - const TIn *const *const inptrs, - const TWeight *const weights, - const int32_t *, // Bias (ravelled) - const arm_gemm::Requantize32 &qp, - const int32_t *, // Requantisation muls (ravelled) - const int32_t *, // Requantisation shifts (ravelled) - TOut *const *const outptrs) { - return f(inptrs, outptrs, weights, n_channels, qp); - }; -} - -template -using UnravelledPackingFn = std::function; - -template -using RavelledPackingFn = std::function; - -template -const RavelledPackingFn get_unified_packer(const UnravelledPackingFn &f) -{ - return [f] (const unsigned int n_channels, - void *buffer, - const int32_t *, // Bias - const T *weights, - const arm_gemm::Requantize32 &, - size_t ld_weight_col, - size_t ld_weight_row) - { - return f(n_channels, buffer, weights, ld_weight_col, ld_weight_row); - }; -} - -template -const RavelledPackingFn get_unified_packer(const RavelledPackingFn &f) { return f; } - -template -constexpr bool requires_unravelled_bias_and_quant_params(const UnravelledPackingFn &) { return true; } - -template -constexpr bool requires_unravelled_bias_and_quant_params(const RavelledPackingFn &) { return false; } - -template -constexpr bool strategy_requires_unravelled_bias_and_quant_params(void) -{ - return requires_unravelled_bias_and_quant_params(strategy::pack_parameters); -} - -} - -template -class DepthwiseDepthfirstQuantized : - public DepthwiseCommon -{ - using TInput = typename strategy::input_type; - using TWeight = typename strategy::weight_type; - using TOutput = typename strategy::return_type; - using TAccum = typename strategy::bias_type; - - arm_gemm::Requantize32 m_qp; - - size_t sizeof_input_buffer(unsigned int n_channels) const - { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(TInput) * rounded_channels; - } - - size_t sizeof_output_buffer(unsigned int n_channels) const - { - const unsigned int vl = arm_gemm::utils::get_vector_length(strategy::vl_type); - const auto rounded_channels = arm_gemm::roundup(n_channels, vl); - return sizeof(TOutput) * rounded_channels; - } - - size_t sizeof_bias_buffer(unsigned int n_channels) const - { - if (strategy_requires_unravelled_bias_and_quant_params()) - { - return (m_qp.bias == nullptr) ? sizeof(TAccum) * n_channels : 0; - } - - return 0; - } - - size_t sizeof_requant_mul_buffer(unsigned int n_channels) const - { - if (strategy_requires_unravelled_bias_and_quant_params()) - { - return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels; - } - - return 0; - } - - size_t sizeof_requant_shift_buffer(unsigned int n_channels) const - { - if (strategy_requires_unravelled_bias_and_quant_params()) - { - return m_qp.per_channel_requant ? 0 : sizeof(int32_t) * n_channels; - } - - return 0; - } - - public: - DepthwiseDepthfirstQuantized(const DepthwiseArgs &args, const arm_gemm::Requantize32 &qp) - : DepthwiseCommon(args), m_qp(qp) - { - } - - DepthwiseDepthfirstQuantized(DepthwiseDepthfirstQuantized &) = delete; - DepthwiseDepthfirstQuantized &operator=(DepthwiseDepthfirstQuantized &) = delete; - - size_t get_storage_size(void) const override - { - return strategy::get_packed_size(this->m_args); - } - - void pack_parameters(void *buffer, const void *const bias, const void *weights, size_t ld_weight_col, size_t ld_weight_row) override - { - if (strategy_requires_unravelled_bias_and_quant_params()) - { - m_qp.bias = static_cast(bias); - } - - get_unified_packer(strategy::pack_parameters)( - this->m_args.input_channels, - buffer, - static_cast(bias), - reinterpret_cast(weights), - m_qp, - ld_weight_col, - ld_weight_row - ); - } - - size_t get_working_size(const unsigned int n_threads, const unsigned int n_channels) const override - { - const unsigned int n_output_channels = n_channels * this->m_args.channel_multiplier; - return n_threads * ( - sizeof_output_buffer(n_output_channels) + - sizeof_input_buffer(n_channels) + - sizeof_bias_buffer(n_channels) + - sizeof_requant_mul_buffer(n_channels) + - sizeof_requant_shift_buffer(n_channels) - ); - } - - using DepthwiseCommon::execute; - void execute( - const unsigned int batches, - const unsigned int input_height, - const unsigned int input_width, - const unsigned int input_channels, - const PaddingValues &padding, - const void *const _input, - const size_t ld_input_col, - const size_t ld_input_row, - const size_t ld_input_batch, - const void *const parameters, - const unsigned int output_height, - const unsigned int output_width, - void *const _output, - const size_t ld_output_col, - const size_t ld_output_row, - const size_t ld_output_batch, - void *_working_space, - const unsigned int thread_id, - const unsigned int n_threads - ) const override - { - strategy strat(this->m_args.cpu_info); -#ifdef CYCLE_PROFILING - arm_gemm::profiler prof; -#endif - // Get a unified API for the kernel function - auto kernel = get_unified_kernel(strat.kernel); - - // Determine what portion of the work to do. - const unsigned int n_rows_per_thread = arm_gemm::iceildiv(output_height, n_threads); - const int start_out_height = std::min(thread_id * n_rows_per_thread, output_height); - const int end_out_height = std::min(start_out_height + n_rows_per_thread, output_height); - - // Cast input and output pointers into the right types - const TInput *const inptr = static_cast(_input); - TOutput *const outptr = static_cast(_output); - - // Create an array for the input pointers - const TInput * _inptr_array[strategy::input_rows * strategy::input_cols]; - const TInput **const inptr_array = _inptr_array; - - // Create an array for the output pointers - TOutput * _outptr_array[strategy::output_rows * strategy::output_cols]; - TOutput **const outptr_array = _outptr_array; - - // Allocate portions of the working space - uint8_t *working_space = static_cast(_working_space) + get_working_size(thread_id, input_channels); - - TOutput *const output_buffer = reinterpret_cast(working_space); - working_space += sizeof_output_buffer(input_channels * this->m_args.channel_multiplier); - - TInput *const input_buffer = reinterpret_cast(working_space); - working_space += sizeof_input_buffer(input_channels); - - const int32_t *const bias_ptr = (m_qp.bias == nullptr) ? reinterpret_cast(working_space) - : m_qp.bias; - working_space += sizeof_bias_buffer(input_channels * this->m_args.channel_multiplier); - - const int32_t *const requant_mul_vec = !m_qp.per_channel_requant ? reinterpret_cast(working_space) - : m_qp.per_channel_muls; - working_space += sizeof_requant_mul_buffer(input_channels * this->m_args.channel_multiplier); - - const int32_t *const requant_shift_vec = !m_qp.per_channel_requant ? reinterpret_cast(working_space) - : m_qp.per_channel_right_shifts; - - if (strategy_requires_unravelled_bias_and_quant_params()) - { - // Initialise the bias buffer - if (m_qp.bias == nullptr) - { - for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++) - { - const_cast(bias_ptr)[c] = 0; - } - } - - // Initialise the requantisation parameters - if (!m_qp.per_channel_requant) - { - for (unsigned int c = 0; c < input_channels * this->m_args.channel_multiplier; c++) - { - const_cast(requant_mul_vec)[c] = m_qp.per_layer_mul; - const_cast(requant_shift_vec)[c] = m_qp.per_layer_right_shift; - } - } - } - - // Initialise the input buffer - for (unsigned int c = 0; c < input_channels; c++) - { - input_buffer[c] = static_cast(m_qp.a_offset); - } - - // For each output tile, construct the requisite set of pointers and call - // into the kernel. - for (unsigned int batch = 0; batch < batches; batch++) - { - // Get batch pointers - const auto inptr_batch = inptr + batch * ld_input_batch; - const auto outptr_batch = outptr + batch * ld_output_batch; - - for (int start_out_i = start_out_height; - start_out_i < end_out_height; - start_out_i += static_cast(strategy::output_rows)) - { - const int end_out_i = start_out_i + strategy::output_rows; - const int start_in_i = start_out_i * strategy::stride_rows - padding.top; - const int end_in_i = start_in_i + strategy::input_rows; - - // Compute top/bottom padding - const auto pad_top = static_cast(-std::min(start_in_i, 0)); - const auto pad_bottom = static_cast(-std::min(static_cast(input_height) - end_in_i, 0)); - const unsigned int valid_output_rows = std::min( - end_out_i - start_out_i, - static_cast(output_height) - start_out_i - ); - - // Fill the input pointer array with padding values - for (auto index = 0u; index < strategy::input_rows * strategy::input_cols; index++) - { - inptr_array[index] = input_buffer; - } - - for (int start_out_j = 0; start_out_j < static_cast(output_width);) - { - const int start_in_j = start_out_j * strategy::stride_cols - this->m_args.padding.left; - const int pad_left = -std::min(0, start_in_j); - - const int end_out_j = start_out_j + strategy::output_cols; - const int end_in_j = start_in_j + strategy::input_cols; - - const auto pad_right = static_cast(-std::min(static_cast(input_width) - end_in_j, 0)); - const unsigned int valid_output_cols = std::min( - end_out_j - start_out_j, - static_cast(output_width) - start_out_j - ); - - // Construct the input pointer array - fill the array with pointers to - // the input buffer and then fill in the required values. - for (auto i = pad_top; i < strategy::input_rows - pad_bottom; i++) - { - // Can skip over the left padding because we will have either the - // same or less than the previous tile. - unsigned int j = pad_left; - const TInput *colptr = inptr_batch + (start_in_i + i) * ld_input_row + (start_in_j + j) * ld_input_col; - const TInput **ptrs = inptr_array + i * strategy::input_cols + j; - for (; j < strategy::input_cols - pad_right; j++) - { - *(ptrs++) = colptr; - colptr += ld_input_col; - } - for (; j < strategy::input_cols; j++) - { - *(ptrs++) = input_buffer; - } - } - - // Construct the output pointer array. - TOutput **outptr_pos = outptr_array; - for (auto i = 0u; i < valid_output_rows; i++) - { - unsigned int j = 0u; - TOutput *colptr = outptr_batch + (start_out_i + i) * ld_output_row + start_out_j * ld_output_col; - for (; j < valid_output_cols; j++) - { - *(outptr_pos++) = colptr; - colptr += ld_output_col; - } - for (; j < strategy::output_cols; j++) - { - *(outptr_pos++) = output_buffer; - } - } - for (auto i = valid_output_rows; i < strategy::output_rows; i++) - { - for (auto j = 0u; j < strategy::output_cols; j++) - { - *(outptr_pos++) = output_buffer; - } - } - - start_out_j += strategy::output_cols; - -#ifdef CYCLE_PROFILING - // TODO Work number - auto p = prof.ScopedProfiler(PROFILE_KERNEL, (unsigned long)(strategy::output_rows * strategy::output_cols * this->m_args.kernel_rows * this->m_args.kernel_cols)); -#endif - kernel( - this->m_args.input_channels, - inptr_array, - reinterpret_cast(parameters), - bias_ptr, m_qp, requant_mul_vec, requant_shift_vec, - outptr_array - ); - } - } - } - } -}; - -} // namespace depthwise -} // namespace arm_conv -- cgit v1.2.1