path: root/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
Diffstat (limited to 'src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp')
-rw-r--r--  src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp | 23
1 file changed, 15 insertions(+), 8 deletions(-)
diff --git a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
index b6f45c6825..592ee72820 100644
--- a/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
+++ b/src/core/NEON/kernels/arm_conv/depthwise/depthfirst_driver.hpp
@@ -72,10 +72,10 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
std::unique_ptr<const IDepthfirstStrategy> m_strat;
/* Compute the amount of working space required for a single thread. */
- virtual size_t get_working_size_per_thread(unsigned int n_input_channels) const = 0;
+ virtual size_t get_working_size_per_thread() const = 0;
/* Initialise the working space for a thread. */
- virtual void initialise_working_space(void *, unsigned int n_input_channels) const = 0;
+ virtual void initialise_working_space(void *) const = 0;
/* Compute a portion of the output tensor with padding. */
virtual void compute_tile_padded(
@@ -164,8 +164,8 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
// Get and initialise the working space for this thread.
void *thread_working_space =
- static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread(args.input_channels);
- this->initialise_working_space(thread_working_space, args.input_channels);
+ static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
+ this->initialise_working_space(thread_working_space);
// Construct convenient representations of the input/output tensors.
TensorSpec<const TInput *> input_tensor(reinterpret_cast<const TInput *>(input), ld_input_row, ld_input_col);
@@ -189,7 +189,9 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
const bool pad_input_top = start_input_i < 0;
const int end_input_i = start_input_i + m_strat->get_input_rows();
const bool pad_input_bottom = static_cast<int>(args.input_rows) < end_input_i;
- const bool pad_row = pad_input_top || pad_input_bottom || pad_output_bottom;
+ // We only need to account for input padding if direct padding is not supported.
+ const bool pad_row = ((pad_input_top || pad_input_bottom) && !this->supports_direct_padding())
+ || pad_output_bottom;
// Iterate over the columns of the output tensor; we attempt to grab as
// much as possible of the unpadded regions, so the loop structure is a
@@ -202,7 +204,7 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
// Determine if we can process a number of unpadded tiles in one go.
int n_unpadded_tiles = 0;
- if (!pad_input_left)
+ if ((!pad_input_left) || this->supports_direct_padding())
{
// Determine the maximum number of tiles we could handle.
n_unpadded_tiles = (args.output_cols - start_output_j) / m_strat->get_output_cols();
@@ -273,9 +275,14 @@ class DepthfirstDriver : public DepthwiseCommon<TInput, TWeight, TOutput>
{
}
- size_t get_working_size(unsigned int n_threads, unsigned int n_input_channels) const override final
+ size_t get_working_size(unsigned int n_threads) const override final
{
- return n_threads * this->get_working_size_per_thread(n_input_channels);
+ return n_threads * this->get_working_size_per_thread();
+ }
+
+ virtual bool supports_direct_padding() const
+ {
+ return false;
}
};
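
The classes above belong to a larger templated driver, so the following is a minimal, self-contained sketch (not the real arm_conv classes) of how the reworked interface composes: get_working_size(n_threads) and the per-thread hooks no longer take a channel count, presumably because each driver can size its working space from state it already holds, and a driver that can consume padded borders directly opts in via supports_direct_padding(), which defaults to false as in the diff. Everything other than those four member functions (HypotheticalDepthfirstDriver, ExampleDirectPaddingDriver, prepare_threads, m_per_thread_bytes) is an illustrative placeholder, not ARM Compute Library API.

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <iostream>
#include <vector>

// Stand-in for the driver base class; only the four members named above mirror
// the signatures visible in the diff.
class HypotheticalDepthfirstDriver
{
  protected:
  // Per-thread working space is sized from state captured at construction time,
  // so no per-call channel argument is needed any more.
  virtual size_t get_working_size_per_thread() const = 0;
  virtual void initialise_working_space(void *buffer) const = 0;

  public:
  virtual ~HypotheticalDepthfirstDriver() = default;

  // Default matches the diff: assume padding must be handled by repacking
  // unless a derived driver opts in to direct padding support.
  virtual bool supports_direct_padding() const { return false; }

  size_t get_working_size(unsigned int n_threads) const
  {
    return n_threads * this->get_working_size_per_thread();
  }

  // Illustrative helper: carve the shared buffer into per-thread slices and
  // initialise each one, as the execute path in the diff does per thread.
  void prepare_threads(void *working_space, unsigned int n_threads) const
  {
    for (unsigned int thread_id = 0; thread_id < n_threads; thread_id++)
    {
      void *thread_working_space =
        static_cast<uint8_t *>(working_space) + thread_id * this->get_working_size_per_thread();
      this->initialise_working_space(thread_working_space);
    }
  }
};

// Hypothetical driver which fixes its working-space size at construction time
// and claims direct padding support, so padded input rows/columns no longer
// force the padded-tile path.
class ExampleDirectPaddingDriver : public HypotheticalDepthfirstDriver
{
  size_t m_per_thread_bytes;

  size_t get_working_size_per_thread() const override { return m_per_thread_bytes; }

  void initialise_working_space(void *buffer) const override
  {
    std::memset(buffer, 0, m_per_thread_bytes);
  }

  public:
  explicit ExampleDirectPaddingDriver(unsigned int n_input_channels)
    : m_per_thread_bytes(n_input_channels * sizeof(float))
  {
  }

  bool supports_direct_padding() const override { return true; }
};

int main()
{
  const unsigned int n_threads = 4, n_channels = 16;
  ExampleDirectPaddingDriver driver(n_channels);

  // Callers allocate working space without passing the channel count down.
  std::vector<uint8_t> working_space(driver.get_working_size(n_threads));
  driver.prepare_threads(working_space.data(), n_threads);

  std::cout << "working space: " << working_space.size() << " bytes, "
            << "direct padding: " << std::boolalpha << driver.supports_direct_padding() << "\n";
  return 0;
}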