Diffstat (limited to 'src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp')
-rw-r--r-- src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp | 408
1 file changed, 408 insertions(+), 0 deletions(-)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
new file mode 100644
index 0000000000..c3daaf04fe
--- /dev/null
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthwise_planar.hpp
@@ -0,0 +1,408 @@
+/*
+ * Copyright (c) 2022-2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#pragma once
+
+#include "depthfirst_driver.hpp"
+#include "interleaves/generic.hpp"
+
+namespace arm_conv {
+namespace depthwise {
+
+template <typename OutputStage>
+class IPlanarStrategy
+{
+ public:
+ virtual ~IPlanarStrategy() = default;
+ virtual unsigned int get_output_rows(void) const = 0;
+ virtual arm_gemm::VLType get_vl_type(void) const = 0;
+
+ virtual size_t get_storage_size(const DepthwiseArgs &) const = 0;
+ virtual void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const = 0;
+};
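+
+// Usage sketch (illustrative, not part of the interface above; strat, args,
+// biases etc. are assumed names): a caller would size a buffer with
+// get_storage_size() and then pack the parameters into it:
+//
+//   std::vector<uint8_t> buf(strat.get_storage_size(args));
+//   strat.pack_parameters(args, buf.data(), biases, output_stage,
+//                         weights, ld_weight_col, ld_weight_row);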
+
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum,
+ typename OutputStage>
+struct PlanarKernelType;
+
+template <typename TInput, typename TWeight, typename TOutput, typename TAccum>
+struct PlanarKernelType<TInput, TWeight, TOutput, TAccum, Nothing>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *, const TAccum *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+    unsigned int start_channel, unsigned int valid_channels,
+ TAccum act_min, TAccum act_max
+ );
+
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+    TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const Nothing &, const WorkspaceType *ws
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights, bias,
+      outptrs, outlds, outldvls, output_cols,
+ start_channel, valid_channels,
+ ws->activation_min, ws->activation_max
+ );
+ }
+};
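+
+// Illustrative note: instantiated for a non-quantized case, e.g.
+// PlanarKernelType<float, float, float, float, Nothing>, the kernel takes the
+// bias pointer directly, and execute() forwards the activation clamp values
+// (activation_min/activation_max) from the workspace.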
+
+template <typename TInput, typename TWeight, typename TOutput>
+struct PlanarKernelType<TInput, TWeight, TOutput, int32_t, arm_gemm::Requantize32>
+{
+ typedef void (*Type)(
+ const TInput *, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *,
+ TOutput **, const size_t *, const size_t *, unsigned int output_cols,
+ unsigned int start_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &
+ );
+
+ template <typename WorkspaceType>
+ static inline void execute(
+ const Type fn,
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const int32_t *,
+ TOutput **outptrs, const size_t *outlds, const size_t *outldvls, unsigned int output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ const arm_gemm::Requantize32 &qp, const WorkspaceType *
+ )
+ {
+ fn(
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows,
+ pad_left, valid_input_cols,
+ weights,
+ outptrs, outlds, outldvls, output_cols,
+ first_channel, valid_channels,
+ qp
+ );
+ }
+};
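+
+// Illustrative note: in the quantized specialisation above, the bias and the
+// activation bounds travel inside the arm_gemm::Requantize32 parameters, so
+// execute() deliberately ignores the int32_t bias pointer and the workspace.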
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class PlanarStrategy : public IPlanarStrategy<OutputStage>
+{
+ unsigned int m_kernel_rows, m_kernel_cols;
+ unsigned int m_stride_rows, m_stride_cols;
+ unsigned int m_output_rows;
+ arm_gemm::VLType m_vl_type;
+
+ protected:
+ virtual bool get_kernel_packing_point(const unsigned int index, unsigned int &x, unsigned int &y) const
+ {
+ // Get the kernel point to pack at the given index; return false to
+ // indicate that this index (and all greater indices) is out of range.
+ if (m_kernel_rows * m_kernel_cols <= index)
+ return false;
+
+ y = index % m_kernel_cols;
+ x = index / m_kernel_cols;
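+    // e.g. (illustrative) for a 3x3 kernel, index 5 gives y = 5 % 3 = 2 and
+    // x = 5 / 3 = 1; the kernel points are thus visited in row-major order.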
+ return true;
+ }
+
+ virtual interleaves::PackingArguments get_kernel_packing_arguments(void) const
+ {
+ return interleaves::PackingArguments(
+ m_kernel_rows, m_kernel_cols, sizeof(TWeight),
+ false, sizeof(TAccum), true, // Don't pack the bias
+ m_vl_type, sizeof(TAccum), 1, // Accumulator depth of 1 TODO
+ [this] (unsigned int idx, unsigned int &x, unsigned int &y) -> bool
+ { return this->get_kernel_packing_point(idx, x, y); }
+ );
+ }
+
+ public:
+ PlanarStrategy(
+ unsigned int kernel_rows, unsigned int kernel_cols,
+ unsigned int stride_rows, unsigned int stride_cols,
+ unsigned int output_rows,
+ arm_gemm::VLType vl_type
+ ) : m_kernel_rows(kernel_rows), m_kernel_cols(kernel_cols),
+ m_stride_rows(stride_rows), m_stride_cols(stride_cols),
+ m_output_rows(output_rows), m_vl_type(vl_type)
+ {
+ }
+
+ unsigned int get_output_rows(void) const override { return m_output_rows; }
+ arm_gemm::VLType get_vl_type(void) const override { return m_vl_type; }
+
+ size_t get_storage_size(const DepthwiseArgs &args) const override
+ {
+ return interleaves::get_storage_size_generic(this->get_kernel_packing_arguments(), args);
+ }
+
+ void pack_parameters(
+ const DepthwiseArgs &args, void *buffer,
+ const void *biases, const OutputStage &,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) const override
+ {
+ interleaves::pack_parameters_generic(
+ this->get_kernel_packing_arguments(), args,
+ buffer, biases, weights, ld_weight_col, ld_weight_row
+ );
+ }
+
+ using KernelType = typename PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::Type;
+ virtual KernelType get_kernel(void) const = 0;
+};
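+
+// A minimal sketch of a concrete strategy (hypothetical: the real strategies
+// and their assembly kernels live elsewhere in this directory, and
+// example_fp32_planar_kernel is an assumed symbol with the matching
+// KernelType signature):
+//
+//   class ExamplePlanarStrategy : public PlanarStrategy<float>
+//   {
+//     public:
+//     ExamplePlanarStrategy()
+//     : PlanarStrategy<float>(3, 3,  // 3x3 kernel
+//                             1, 1,  // unit stride
+//                             4,     // four output rows per call
+//                             arm_gemm::VLType::None) {}
+//     KernelType get_kernel(void) const override
+//     { return example_fp32_planar_kernel; }
+//   };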
+
+
+namespace {
+
+template <typename T>
+struct OutputRowPtrsElement
+{
+ struct Workspace
+ {
+ T **output_row_ptrs;
+ size_t *output_ld_cols;
+ size_t *output_ld_vls; // Stride between vectors of channels
+ T *output_padding_buffer;
+ };
+
+ template <typename OutputStage>
+ static size_t get_element_size(const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+    // We need one pointer and two strides for each row of output, plus an
+    // additional blob of memory into which padded stores can go.
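+    // For example (illustrative): four output rows on a 64-bit target with a
+    // 128-bit vector length give 4 * (8 + 2*8) + 16 = 112 bytes.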
+ return args.strategy->get_output_rows() * (sizeof(T *) + 2*sizeof(size_t)) +
+ get_vector_length<char>(args.strategy->get_vl_type());
+ }
+
+ template <typename WorkspaceType, typename OutputStage>
+ static void *initialise(WorkspaceType *ws, void *buffer,
+ const WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage> &args)
+ {
+ const auto n_rows = args.strategy->get_output_rows();
+ ws->output_row_ptrs = reinterpret_cast<T **>(buffer);
+ ws->output_ld_cols = reinterpret_cast<size_t *>(ws->output_row_ptrs + n_rows);
+ ws->output_ld_vls = ws->output_ld_cols + n_rows;
+ ws->output_padding_buffer = reinterpret_cast<T *>(ws->output_ld_vls + n_rows);
+ return ws->output_padding_buffer + get_vector_length<T>(args.strategy->get_vl_type());
+ }
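+
+  // The resulting layout (illustrative) of this element's slice of the
+  // workspace buffer is:
+  //   [ T* x n_rows | ld_cols x n_rows | ld_vls x n_rows | padding vector ]
+  // with the returned pointer marking the first byte after this element.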
+};
+
+} // namespace {anonymous}
+
+
+template <typename TInput, typename TWeight=TInput, typename TOutput=TInput,
+ typename TAccum=typename DefaultTAccum<TOutput>::Type,
+ typename OutputStage=typename DefaultOutputStage<TOutput>::Type>
+class DepthwisePlanar : public DepthwiseCommon<TInput, TWeight, TOutput>
+{
+ using Parent = DepthwiseCommon<TInput, TWeight, TOutput>;
+ using StrategyType = IPlanarStrategy<OutputStage>;
+ using WorkspaceManager = Workspace<
+ OutputRowPtrsElement<TOutput>,
+ ActivationsElement<TAccum, OutputStage>
+ >;
+ using WorkspaceType = typename WorkspaceManager::WorkspaceType;
+
+ std::unique_ptr<StrategyType> m_strat;
+ const TAccum *m_bias;
+ OutputStage m_os;
+
+ public:
+ DepthwisePlanar(StrategyType *const strat, const DepthwiseArgs &args, const OutputStage &os = {})
+ : Parent(args), m_strat(strat), m_bias(nullptr), m_os(os)
+ {
+ }
+
+ DepthwisePlanar(DepthwisePlanar &) = delete;
+ DepthwisePlanar &operator=(DepthwisePlanar &) = delete;
+
+ size_t get_storage_size(void) const override
+ {
+ return m_strat->get_storage_size(this->m_args);
+ }
+
+ void pack_parameters(
+ void *buffer, const void *biases,
+ const void *weights, size_t ld_weight_col, size_t ld_weight_row
+ ) override
+ {
+ m_strat->pack_parameters(this->m_args, buffer, biases, {}, weights, ld_weight_col, ld_weight_row);
+ this->m_bias = reinterpret_cast<const TAccum *>(biases);
+ depthwise_depthfirst::stash_bias(this->m_os, biases);
+ }
+
+ size_t get_working_size(unsigned int n_threads) const override
+ {
+ return this->get_working_size_per_thread() * n_threads;
+ }
+
+ protected:
+ /* Compute the amount of working space required for a single thread. */
+ virtual size_t get_working_size_per_thread(void) const
+ {
+ return WorkspaceManager::get_sizeof_workspace(
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os));
+ }
+
+ /* Initialise the working space for a thread. */
+ virtual void initialise_working_space(void *buffer) const
+ {
+ WorkspaceManager::initialise(
+ buffer,
+ WorkspaceArgs<IPlanarStrategy<OutputStage>, OutputStage>(m_strat.get(), this->m_args, m_os)
+ );
+ }
+
+ /* Execute the kernel for a given chunk of work. */
+ virtual void execute_kernel(
+ const TInput *inptr, size_t ld_in_row, size_t ld_in_col, size_t ld_in_vl,
+ unsigned int pad_top, unsigned int valid_input_rows,
+ unsigned int pad_left, unsigned int valid_input_cols,
+ const TWeight *weights, const TAccum *bias,
+ TOutput *outptr, size_t ld_out_row, size_t ld_out_col, size_t ld_out_vl,
+ unsigned int valid_output_rows, unsigned int valid_output_cols,
+ unsigned int first_channel, unsigned int valid_channels,
+ WorkspaceType *ws
+ ) const
+ {
+ // Initialise the output pointers
+ for (auto i = 0u; i < m_strat->get_output_rows(); i++)
+ {
+ // Point at the output tensor for all valid rows; otherwise point at the
+ // padding buffer.
+ ws->output_row_ptrs[i] = i < valid_output_rows ? outptr : ws->output_padding_buffer;
+ ws->output_ld_cols[i] = i < valid_output_rows ? ld_out_col : 0;
+ ws->output_ld_vls[i] = i < valid_output_rows ? ld_out_vl : 0;
+ outptr += ld_out_row;
+ }
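+
+    // Note: all rows beyond valid_output_rows alias the same scratch vector
+    // with zero strides, so their stores land harmlessly in
+    // output_padding_buffer rather than in the output tensor.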
+
+ // Execute the kernel
+ PlanarKernelType<TInput, TWeight, TOutput, TAccum, OutputStage>::template execute<WorkspaceType>(
+ reinterpret_cast<const PlanarStrategy<TInput, TWeight, TOutput, TAccum, OutputStage> *>(m_strat.get())->get_kernel(),
+ inptr, ld_in_row, ld_in_col, ld_in_vl,
+ pad_top, valid_input_rows, pad_left, valid_input_cols,
+ weights, bias,
+ ws->output_row_ptrs, ws->output_ld_cols, ws->output_ld_vls,
+ valid_output_cols, first_channel, valid_channels,
+ this->m_os, ws
+ );
+ }
+
+ void execute_internal(
+ const DepthwiseArgs &args,
+ const void *input,
+ size_t ld_input_col,
+ size_t ld_input_row,
+ size_t ld_input_batch,
+ const void *parameters,
+ void *output,
+ size_t ld_output_col,
+ size_t ld_output_row,
+ size_t ld_output_batch,
+ void *working_space,
+ unsigned int thread_id,
+ unsigned int n_threads
+ ) const override
+ {
+ // Get and initialise the working space for this thread.
+ void *thread_working_space =
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
+ auto ws = reinterpret_cast<WorkspaceType *>(thread_working_space);
+
+ const auto n_output_channels = args.input_channels * args.channel_multiplier;
+ const auto vl = get_vector_length<TAccum>(m_strat->get_vl_type());
+
+ // Get typed pointers
+ auto input_batch = reinterpret_cast<const TInput *>(input);
+ auto output_batch = reinterpret_cast<TOutput *>(output);
+ auto weights = reinterpret_cast<const TWeight *>(parameters);
+
+ // Iterate over batches
+ for (auto batches = args.n_batches; batches; batches--)
+ {
+ // NOTE: Other loop orderings are possible and it would be worth
+ // investigating them.
+
+ // Within a batch, stripe threads across rows.
+ for (auto start_output_i = thread_id * m_strat->get_output_rows();
+ start_output_i < args.output_rows;
+ start_output_i += n_threads * m_strat->get_output_rows())
+ {
+        // Determine what padding (if any) is required on the top/bottom of
+        // this row of the convolution.
+ const int start_input_i = start_output_i * args.stride_rows - args.padding.top;
+ const unsigned int input_pad_top = start_input_i < 0 ? -start_input_i : 0;
+ const unsigned int input_i = start_input_i < 0 ? 0 : start_input_i;
+ const unsigned int valid_input_rows = input_i > args.input_rows ? 0 : args.input_rows - input_i;
+ const unsigned int valid_output_rows = args.output_rows - start_output_i;
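+        // Worked example (illustrative): with stride_rows = 2 and
+        // padding.top = 1, the first output row (start_output_i = 0) gives
+        // start_input_i = -1, hence input_pad_top = 1 and input_i = 0.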
+
+        auto inptr_row = input_batch + input_i * ld_input_row;
+ auto outptr_row = output_batch + start_output_i * ld_output_row;
+
+ // Execute the kernel
+ this->execute_kernel(
+ inptr_row, ld_input_row, ld_input_col, vl,
+ input_pad_top, valid_input_rows, args.padding.left, args.input_cols,
+ weights, this->m_bias,
+ outptr_row, ld_output_row, ld_output_col, vl,
+ valid_output_rows, args.output_cols,
+ 0 /* first channel */, n_output_channels,
+ ws
+ );
+ }
+
+ // Update the input and output pointers to account for batch
+ input_batch += ld_input_batch;
+ output_batch += ld_output_batch;
+ }
+ }
+};
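+
+// Usage sketch (illustrative, reusing the hypothetical ExamplePlanarStrategy
+// from above): the working space is sized for all threads up front, and
+// execute_internal() offsets into it by thread_id:
+//
+//   DepthwisePlanar<float> dw(new ExamplePlanarStrategy(), args);
+//   std::vector<uint8_t> params(dw.get_storage_size());
+//   dw.pack_parameters(params.data(), biases, weights, ld_col, ld_row);
+//   std::vector<uint8_t> scratch(dw.get_working_size(n_threads));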
+
+} // namespace depthwise
+} // namespace arm_conv