From 8deee9bd9b9137c256c23b86be11dbf0466f3aa8 Mon Sep 17 00:00:00 2001 From: Michael Tyler Date: Fri, 30 Jun 2023 11:26:05 +0100 Subject: Depthwise channel pre-multiplication Resolves: COMPMID-6337 Change-Id: Ie9097b3f56e8071426c621386a5988bd7f7e8ef2 Signed-off-by: Michael Tyler Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9852 Tested-by: Arm Jenkins Reviewed-by: Viet-Hoa Do Benchmark: Arm Jenkins --- .../depthwise/depthwise_depthfirst_generic.hpp | 45 ++++++++++++++-------- 1 file changed, 28 insertions(+), 17 deletions(-) (limited to 'src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp') diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp index b058ce26f2..ca5026b6e0 100644 --- a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp +++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_depthfirst_generic.hpp @@ -99,7 +99,7 @@ class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommonget_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), - false, sizeof(TAccum), // Don't pack the bias + false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool { return this->get_kernel_packing_point(idx, x, y); } @@ -115,7 +115,7 @@ class GenericDepthfirstStrategy : public DepthwiseDepthfirstStrategyCommonget_kernel_rows(), this->get_kernel_cols(), sizeof(TWeight), - false, sizeof(TAccum), // Don't pack the bias + false, sizeof(TAccum), this->uses_premultiply(), // Don't pack the bias this->get_vl_type(), sizeof(TAccum), this->get_accumulator_depth_vl(), [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool { return this->get_kernel_packing_point(idx, x, y); } @@ -208,6 +208,7 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon, GenericInputArrayElement, InputBufferElement, + IntermediateBufferElement, ActivationsElement >; using WorkingSpace = typename WorkspaceManager::WorkspaceType; @@ -232,21 +233,38 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommonget_output_stage(), m_bias); } - size_t get_working_size_per_thread(const unsigned int n_input_channels) const override + size_t get_working_size_per_thread() const override { DepthwiseArgs args(this->m_args); - args.input_channels = n_input_channels; return WorkspaceManager::get_sizeof_workspace(WorkspaceArgs(this->m_strat.get(), args, this->get_output_stage())); } - void initialise_working_space(void *buffer, unsigned int n_input_channels) const override + void initialise_working_space(void *buffer) const override { DepthwiseArgs args(this->m_args); - args.input_channels = n_input_channels; return WorkspaceManager::initialise(buffer, WorkspaceArgs(this->m_strat.get(), args, this->get_output_stage())); } protected: + void fill_inptr_array(const DepthwiseArgs &args, + const TensorSpec &input, + const TInput **inptr_array, TInput *input_buffer, + const unsigned int input_i, const unsigned int input_j, + const unsigned int input_pad_top, const unsigned int input_pad_left) const override + { + fill_pointer_array_generic_kernel( + inptr_array, + this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), + args.kernel_rows, args.kernel_cols, + args.stride_rows, args.stride_cols, + input.base, + input.ld_row, input.ld_col, + input_buffer, + input_pad_top, args.input_rows - input_i, + input_pad_left, args.input_cols - input_j + ); + } + void compute_tile_padded( const DepthwiseArgs &args, unsigned int output_i, unsigned int output_j, @@ -268,17 +286,10 @@ class DepthwiseDepthfirstGeneric : public DepthwiseDepthfirstCommon(ij < 0 ? -ij : 0); const auto input_j = static_cast(ij < 0 ? 0 : ij); - fill_pointer_array_generic_kernel( - ws->inptr_array, - this->m_strat->get_output_rows(), this->m_strat->get_output_cols(), - args.kernel_rows, args.kernel_cols, - args.stride_rows, args.stride_cols, - input.base + input_i*input.ld_row + input_j*input.ld_col + channel_start, - input.ld_row, input.ld_col, - ws->input_buffer, - input_pad_top, args.input_rows - input_i, - input_pad_left, args.input_cols - input_j - ); + Tile multiplied_input; + this->initialise_inptr_array(args, channel_start, channel_end, input, + ws->inptr_array, ws->input_buffer, ws->intermediate_buffer, + input_i, input_j, input_pad_top, input_pad_left, multiplied_input); // Compute the output pointer array fill_pointer_array( -- cgit v1.2.1