aboutsummaryrefslogtreecommitdiff
path: root/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2018-08-13 15:49:49 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:54:54 +0000
commit26b22160c00d9955255015d82203c7e16f28f0c3 (patch)
tree8646becfe3133ac376d7fc534abe95cbaeb76719 /src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
parentbc92a34f9a96da8fa52fd3563e10f0d1bdd7f3fe (diff)
downloadComputeLibrary-26b22160c00d9955255015d82203c7e16f28f0c3.tar.gz
COMPMID-1480 Add support for NHWC QASYMM8/FP32(non-optimized) to NEON DepthwiseConvolution
Change-Id: I751f5d3fb74085d2e67f610ecf52da4736d0cfb5 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/143870 Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Tested-by: Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp')
-rw-r--r--src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp275
1 files changed, 199 insertions, 76 deletions
diff --git a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
index 3b54ed62c7..d1727fc878 100644
--- a/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDepthwiseConvolutionLayer.cpp
@@ -36,8 +36,8 @@ using namespace arm_compute::misc;
using namespace arm_compute::misc::shape_calculator;
NEDepthwiseConvolutionLayer3x3::NEDepthwiseConvolutionLayer3x3()
- : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _input_nhwc(), _weights_hwio(), _output_nhwc(), _has_bias(false),
- _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true)
+ : _dwc_kernel(), _output_stage_kernel(), _border_handler(), _permute_input(), _permute_weights(), _permute_output(), _accumulator(), _permuted_input(), _permuted_weights(), _permuted_output(),
+ _has_bias(false), _is_quantized(false), _is_optimized(false), _are_weights_reshaped(false), _is_nchw(true), _is_first_run(true), _permute(false)
{
}
@@ -57,29 +57,31 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
input->info()->data_layout());
_are_weights_reshaped = false;
_is_nchw = input->info()->data_layout() == DataLayout::NCHW;
-
- ARM_COMPUTE_ERROR_ON(!_is_optimized && !_is_nchw);
+ _permute = _is_optimized == _is_nchw;
if(_is_optimized)
{
if(_is_nchw)
{
// Configure the function to transform the input tensor from NCHW -> NHWC
- _permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
+ _permute_input.configure(input, &_permuted_input, PermutationVector(2U, 0U, 1U));
+ _permuted_input.info()->set_data_layout(DataLayout::NHWC);
// Configure the function to transform the weights tensor from IHW -> HWI
- _permute_weights.configure(weights, &_weights_hwio, PermutationVector(2U, 0U, 1U));
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(2U, 0U, 1U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NHWC);
// Configure optimized depthwise
- _dwc_kernel.configure(&_input_nhwc, &_weights_hwio, &_output_nhwc, conv_info, depth_multiplier, DataLayout::NHWC);
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, &_permuted_output, conv_info, depth_multiplier, DataLayout::NHWC);
// Configure the function to transform the convoluted output to ACL's native ordering format NCHW
- _permute_output.configure(&_output_nhwc, output, PermutationVector(1U, 2U, 0U));
+ _permute_output.configure(&_permuted_output, output, PermutationVector(1U, 2U, 0U));
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
// Allocate tensors
- _input_nhwc.allocator()->allocate();
- _weights_hwio.allocator()->allocate();
- _output_nhwc.allocator()->allocate();
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
}
else
{
@@ -91,36 +93,70 @@ void NEDepthwiseConvolutionLayer3x3::configure(ITensor *input, const ITensor *we
// Allocate the intermediate accumulator tensor in case of quantized input
if(_is_quantized)
{
- _accumulator.allocator()->init(TensorInfo(output->info()->tensor_shape(), 1, DataType::S32));
+ TensorShape accum_shape = output->info()->tensor_shape();
+
+ if(!_is_nchw)
+ {
+ permute(accum_shape, PermutationVector(1U, 2U, 0U));
+ }
+
+ _accumulator.allocator()->init(TensorInfo(accum_shape, 1, DataType::S32));
_accumulator.info()->set_quantization_info(input->info()->quantization_info());
zero_value = PixelValue(static_cast<uint32_t>(input->info()->quantization_info().offset));
}
- // Configure depthwise convolution kernel
- _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
+ if(!_is_nchw)
+ {
+ // Configure the function to transform the input tensor from NHWC -> NCHW
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
- // Configure border handler
- _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- }
+ // Configure the function to transform the weights tensor from HWI -> IHW
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
- // Configure biases accumulation
- if(_has_bias || _is_quantized)
- {
- if(_is_quantized)
- {
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ // Configure optimized depthwise
+ _dwc_kernel.configure(&_permuted_input, &_permuted_weights, (_is_quantized) ? &_accumulator : &_permuted_output, conv_info, depth_multiplier);
+
+ // Configure border handler
+ _border_handler.configure(&_permuted_input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
- float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
- int output_multiplier, output_shift;
- quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_accumulator, biases, output, output_multiplier, output_shift, output_quant_info.offset);
- _accumulator.allocator()->allocate();
+ // Allocate tensors
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
}
else
{
- _output_stage_kernel.configure(output, biases);
+ // Configure depthwise convolution kernel
+ _dwc_kernel.configure(input, weights, (_is_quantized) ? &_accumulator : output, conv_info, depth_multiplier);
+
+ // Configure border handler
+ _border_handler.configure(input, _dwc_kernel.border_size(), BorderMode::CONSTANT, zero_value);
}
}
+
+ // Configure biases accumulation
+ if(_is_quantized)
+ {
+ const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+
+ float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
+ int output_multiplier, output_shift;
+ quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+ _output_stage_kernel.configure(&_accumulator, biases, _is_nchw ? output : &_permuted_output, output_multiplier, output_shift, output_quant_info.offset);
+ _accumulator.allocator()->allocate();
+ }
+ else if(_has_bias)
+ {
+ _output_stage_kernel.configure((_is_nchw || _is_optimized) ? output : &_permuted_output, biases);
+ }
+
+ if(!_is_optimized && !_is_nchw)
+ {
+ // Configure the function to transform the convoluted output to NHWC
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+ _permuted_output.allocator()->allocate();
+ }
}
Status NEDepthwiseConvolutionLayer3x3::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
@@ -142,32 +178,29 @@ void NEDepthwiseConvolutionLayer3x3::run()
_dwc_kernel.generate_convolver();
}
- // Permute weights in HWIO format if the optimized kernel will be executedd
- if(!_are_weights_reshaped && _is_optimized && _is_nchw)
- {
- _are_weights_reshaped = true;
- _permute_weights.run();
- }
-
- // Handle input
- if(_is_optimized)
+ // Permute weights
+ if(_permute)
{
- if(_is_nchw)
+ if(!_are_weights_reshaped)
{
- // Permute input to NHWC format execution
- _permute_input.run();
+ _are_weights_reshaped = true;
+ _permute_weights.run();
}
+
+ _permute_input.run();
}
- else
+
+ // Handle input
+ if(!_is_optimized)
{
- // Fill border in NCHW format execution
+ // Fill border
NEScheduler::get().schedule(&_border_handler, Window::DimX);
}
// Execute depthwise convolution
NEScheduler::get().schedule(&_dwc_kernel, Window::DimX);
- // Permute output to ACL's native NCHW format in case of NHWC execution
+ // Permute output
if(_is_optimized && _is_nchw)
{
_permute_output.run();
@@ -178,27 +211,54 @@ void NEDepthwiseConvolutionLayer3x3::run()
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
+
+ // Permute output
+ if(!_is_optimized && !_is_nchw)
+ {
+ _permute_output.run();
+ }
}
NEDepthwiseConvolutionLayer::NEDepthwiseConvolutionLayer()
- : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _input_reshaped(),
- _weights_reshaped(), _v2mm_output(), _output_reshaped(), _is_prepared(false), _is_quantized(false), _original_weights(nullptr)
+ : _im2col_kernel(), _weights_reshape_kernel(), _v2mm_kernel(), _vector_to_tensor_kernel(), _output_stage_kernel(), _v2mm_input_fill_border(), _v2mm_weights_fill_border(), _permute_input(),
+ _permute_weights(), _permute_output(), _input_reshaped(), _weights_reshaped(), _v2mm_output(), _output_reshaped(), _permuted_input(), _permuted_weights(), _permuted_output(), _is_prepared(false),
+ _is_quantized(false), _is_nhwc(false), _original_weights(nullptr)
{
}
void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, unsigned int depth_multiplier)
{
+ const unsigned int channel_idx = get_data_layout_dimension_index(input->info()->data_layout(), DataLayoutDimension::CHANNEL);
+ ARM_COMPUTE_UNUSED(channel_idx);
+
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::F32);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights);
- ARM_COMPUTE_ERROR_ON((input->info()->dimension(2) * depth_multiplier) != weights->info()->dimension(2));
+ ARM_COMPUTE_ERROR_ON((input->info()->dimension(channel_idx) * depth_multiplier) != weights->info()->dimension(channel_idx));
+
+ _is_nhwc = input->info()->data_layout() == DataLayout::NHWC;
+
+ ITensor *input_to_use = input;
+ const ITensor *weights_to_use = weights;
+ ITensor *output_to_use = output;
+
+ if(_is_nhwc)
+ {
+ _permute_input.configure(input, &_permuted_input, PermutationVector(1U, 2U, 0U));
+ _permuted_input.info()->set_data_layout(DataLayout::NCHW);
+ input_to_use = &_permuted_input;
+
+ _permute_weights.configure(weights, &_permuted_weights, PermutationVector(1U, 2U, 0U));
+ _permuted_weights.info()->set_data_layout(DataLayout::NCHW);
+ weights_to_use = &_permuted_weights;
+ }
- const size_t weights_w = weights->info()->dimension(0);
- const size_t weights_h = weights->info()->dimension(1);
- const size_t weights_z = weights->info()->dimension(2);
+ const size_t weights_w = weights_to_use->info()->dimension(0);
+ const size_t weights_h = weights_to_use->info()->dimension(1);
+ const size_t weights_z = weights_to_use->info()->dimension(2);
_is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
_is_prepared = false;
- _original_weights = weights;
+ _original_weights = weights_to_use;
// Should bias be appended ?
bool append_bias = (biases != nullptr) && !_is_quantized;
@@ -210,6 +270,14 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape));
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+ if(_is_nhwc)
+ {
+ permute(output_shape, PermutationVector(1U, 2U, 0U));
+ _permuted_output.allocator()->init(output->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
+ _permuted_output.info()->set_data_layout(DataLayout::NCHW);
+ output_to_use = &_permuted_output;
+ }
+
// Output width and height
const unsigned int conv_w = output_shape.x();
const unsigned int conv_h = output_shape.y();
@@ -219,41 +287,50 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
const size_t conv_size = conv_w * conv_h;
// Im2Col configuration
- TensorShape shape_im2col = input->info()->tensor_shape();
+ TensorShape shape_im2col = input_to_use->info()->tensor_shape();
shape_im2col.set(0, patch_size);
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
- _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- _im2col_kernel.configure(input, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
+ _input_reshaped.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+ _im2col_kernel.configure(input_to_use, &_input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier);
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
- _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- _weights_reshape_kernel.configure(weights, &_weights_reshaped, append_bias ? biases : nullptr);
+ _weights_reshaped.allocator()->init(weights->info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
+ _weights_reshape_kernel.configure(weights_to_use, &_weights_reshaped, append_bias ? biases : nullptr);
// GEMV configuration
DataType v2mm_dt = (input->info()->data_type() == DataType::QASYMM8) ? DataType::S32 : input->info()->data_type();
- TensorShape shape_v2mm_out = input->info()->tensor_shape();
+ TensorShape shape_v2mm_out = input_to_use->info()->tensor_shape();
shape_v2mm_out.set(0, conv_size * weights_z);
shape_v2mm_out.set(1, 1);
shape_v2mm_out.set(2, 1);
- _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ _v2mm_output.allocator()->init(input->info()->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
_v2mm_kernel.configure(&_input_reshaped, &_weights_reshaped, &_v2mm_output);
_output_reshaped.allocator()->init(_v2mm_output.info()->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output, conv_w, conv_h);
+ _vector_to_tensor_kernel.configure(&_v2mm_output, (_is_quantized) ? &_output_reshaped : output_to_use, conv_w, conv_h);
// Output staged configuration
if(_is_quantized)
{
- const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
+ const QuantizationInfo output_quant_info = output->info()->quantization_info();
float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
int output_multiplier, output_shift;
quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
- _output_stage_kernel.configure(&_output_reshaped, biases, output, output_multiplier, output_shift, output_quant_info.offset);
+ _output_stage_kernel.configure(&_output_reshaped, biases, output_to_use, output_multiplier, output_shift, output_quant_info.offset);
_output_reshaped.allocator()->allocate();
}
+ if(_is_nhwc)
+ {
+ _permute_output.configure(&_permuted_output, output, PermutationVector(2U, 0U, 1U));
+
+ _permuted_input.allocator()->allocate();
+ _permuted_weights.allocator()->allocate();
+ _permuted_output.allocator()->allocate();
+ }
+
// Fill borders on inputs
PixelValue zero_in(static_cast<int32_t>(0));
PixelValue zero_w(static_cast<int32_t>(0));
@@ -273,53 +350,84 @@ void NEDepthwiseConvolutionLayer::configure(ITensor *input, const ITensor *weigh
_v2mm_output.allocator()->allocate();
}
-Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+Status NEDepthwiseConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &conv_info,
unsigned int depth_multiplier)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() != DataLayout::NCHW && input->data_layout() != DataLayout::NHWC);
+ const ITensorInfo *input_to_use = input;
+ const ITensorInfo *weights_to_use = weights;
+ const ITensorInfo *output_to_use = output;
+
+ TensorShape permuted_input_shape = input->tensor_shape();
+ TensorShape permuted_weights_shape = weights->tensor_shape();
+ TensorInfo permuted_input;
+ TensorInfo permuted_weights;
+
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ permute(permuted_input_shape, PermutationVector(1U, 2U, 0U));
+ permute(permuted_weights_shape, PermutationVector(1U, 2U, 0U));
+
+ permuted_input = TensorInfo(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_input_shape).set_data_layout(DataLayout::NCHW));
+ permuted_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(permuted_weights_shape).set_data_layout(DataLayout::NCHW));
+
+ input_to_use = &permuted_input;
+ weights_to_use = &permuted_weights;
+ }
+
const bool is_quantized = is_data_type_quantized_asymmetric(input->data_type());
const bool append_bias = (biases != nullptr) && !is_quantized;
- const TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
- const size_t weights_w = weights->dimension(0);
- const size_t weights_h = weights->dimension(1);
- const size_t weights_z = weights->dimension(2);
+ TensorShape output_shape = shape_calculator::compute_depthwise_convolution_shape(*input, *weights, conv_info, depth_multiplier);
+ const size_t weights_w = weights_to_use->dimension(0);
+ const size_t weights_h = weights_to_use->dimension(1);
+ const size_t weights_z = weights_to_use->dimension(2);
const unsigned int conv_w = output_shape.x();
const unsigned int conv_h = output_shape.y();
const size_t patch_size = weights_w * weights_h + (append_bias ? 1 : 0);
const size_t conv_size = conv_w * conv_h;
+ // Output auto inizialitation if not yet initialized
+ auto_init_if_empty(*output, input->clone()->set_tensor_shape(output_shape));
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape);
+ TensorInfo permuted_output;
+ if(input->data_layout() == DataLayout::NHWC)
+ {
+ permute(output_shape, PermutationVector(1U, 2U, 0U));
+ permuted_output = TensorInfo(output->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape).set_data_layout(DataLayout::NCHW));
+ output_to_use = &permuted_output;
+ }
+
// Im2Col configuration
- TensorShape shape_im2col = input->tensor_shape();
+ TensorShape shape_im2col = input_to_use->tensor_shape();
shape_im2col.set(0, patch_size);
shape_im2col.set(1, conv_size);
shape_im2col.set(2, weights_z);
- TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
+ TensorInfo input_reshaped(input->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_im2col).set_data_layout(DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseIm2ColKernel::validate(input_to_use, &input_reshaped, Size2D(weights_w, weights_h), conv_info, append_bias, depth_multiplier));
// Weights reshape configuration
const TensorShape shape_weights_reshape(patch_size, weights_z);
- TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights, &weights_reshaped, append_bias ? biases : nullptr));
+ TensorInfo weights_reshaped(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(shape_weights_reshape).set_data_layout(DataLayout::NCHW));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseWeightsReshapeKernel::validate(weights_to_use, &weights_reshaped, append_bias ? biases : nullptr));
// GEMV configuration
DataType v2mm_dt = (input->data_type() == DataType::QASYMM8) ? DataType::S32 : input->data_type();
- TensorShape shape_v2mm_out = input->tensor_shape();
+ TensorShape shape_v2mm_out = input_to_use->tensor_shape();
shape_v2mm_out.set(0, conv_size * weights_z);
shape_v2mm_out.set(1, 1);
shape_v2mm_out.set(2, 1);
- TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out));
+ TensorInfo v2mm_output(input->clone()->set_is_resizable(true).reset_padding().set_data_type(v2mm_dt).set_tensor_shape(shape_v2mm_out).set_data_layout(DataLayout::NCHW));
ARM_COMPUTE_RETURN_ON_ERROR(NEGEMMMatrixVectorMultiplyKernel::validate(&input_reshaped, &weights_reshaped, &v2mm_output));
- TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_shape));
- ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output, conv_w, conv_h));
+ TensorInfo output_reshaped(v2mm_output.clone()->set_is_resizable(true).reset_padding().set_tensor_shape(output_to_use->tensor_shape()));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDepthwiseVectorToTensorKernel::validate(&v2mm_output, (is_quantized) ? &output_reshaped : output_to_use, conv_w, conv_h));
if(is_quantized)
{
- ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output));
+ ARM_COMPUTE_RETURN_ON_ERROR(NEDirectConvolutionLayerOutputStageKernel::validate(&output_reshaped, biases, output_to_use));
}
return Status{};
@@ -329,6 +437,11 @@ void NEDepthwiseConvolutionLayer::run()
{
prepare();
+ if(_is_nhwc)
+ {
+ _permute_input.run();
+ }
+
NEScheduler::get().schedule(&_im2col_kernel, Window::DimX);
NEScheduler::get().schedule(&_v2mm_input_fill_border, Window::DimX);
NEScheduler::get().schedule(&_v2mm_kernel, Window::DimX);
@@ -337,6 +450,11 @@ void NEDepthwiseConvolutionLayer::run()
{
NEScheduler::get().schedule(&_output_stage_kernel, Window::DimX);
}
+
+ if(_is_nhwc)
+ {
+ _permute_output.run();
+ }
}
void NEDepthwiseConvolutionLayer::prepare()
@@ -345,6 +463,11 @@ void NEDepthwiseConvolutionLayer::prepare()
{
ARM_COMPUTE_ERROR_ON(!_original_weights->is_used());
+ if(_is_nhwc)
+ {
+ _permute_weights.run();
+ }
+
// Run reshape and mark original weights as unused
_weights_reshaped.allocator()->allocate();
NEScheduler::get().schedule(&_weights_reshape_kernel, Window::DimX);