author    Michalis Spyrou <michalis.spyrou@arm.com>    2020-09-28 23:28:08 +0100
committer Michalis Spyrou <michalis.spyrou@arm.com>    2020-09-29 10:32:38 +0000
commit    017ead22de327b1e46882e423558e5698d8da90e (patch)
tree      9f84533935df3b465d59e964243cfce1cf7421c7 /src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
parent    d175ecea20f5c9e3bff07f7c5d3e4bb5519f7cb9 (diff)
download  ComputeLibrary-017ead22de327b1e46882e423558e5698d8da90e.tar.gz
COMPMID-3174: Remove padding from NEDirectConvolutionLayerOutputStageKernel
Signed-off-by: Michalis Spyrou <michalis.spyrou@arm.com>
Change-Id: I3c5cfe50e9cee30b66f4094da105d383c077aaf9
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4044
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp')
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp | 359
1 file changed, 208 insertions(+), 151 deletions(-)
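
The core of the patch is the same loop structure applied to each output-stage variant: instead of requesting a padded access window sized to the vector width, the kernel collapses the window's X dimension, walks full vectors in an inner loop, and processes the remaining elements one by one in a scalar "Left-overs loop". A minimal standalone sketch of that pattern, using raw NEON intrinsics rather than the library's wrapper::/Window/Iterator helpers (function and parameter names here are illustrative, not ACL API), looks like this:

    #include <arm_neon.h>
    #include <cstddef>

    // Add a single bias value to a row of floats without requiring the row
    // to be padded up to a multiple of the vector width.
    void add_bias_row(const float *in, float bias, float *out, size_t len)
    {
        const size_t step = 4; // a 128-bit NEON register holds 4 floats
        size_t       x    = 0;

        // Vectorized body: touches only full vectors, so no out-of-bounds access.
        for(; x + step <= len; x += step)
        {
            float32x4_t v = vld1q_f32(in + x);
            v             = vaddq_f32(v, vdupq_n_f32(bias));
            vst1q_f32(out + x, v);
        }

        // Leftovers loop: the tail that previously relied on tensor padding.
        for(; x < len; ++x)
        {
            out[x] = in[x] + bias;
        }
    }

The diff below applies this split to the float NCHW/NHWC paths and to the quantized S32 paths, and drops the validate_and_configure_window() helper that used to request the extra padding.
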
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
index 2814c67f70..c2c85f81ef 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerOutputStageKernel.cpp
@@ -86,78 +86,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
return Status{};
}
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output,
- const DirectConvolutionLayerOutputStageKernelInfo &info)
-{
- ARM_COMPUTE_ERROR_ON(input->data_layout() == DataLayout::UNKNOWN);
-
- const DataType data_type = input->data_type();
-
- // Auto-initialize output output if required
- if(output != nullptr)
- {
- // Work out expected output data type
- const DataType output_dt = (data_type == DataType::S32) ? info.output_data_type : data_type;
- // Output tensor auto initialization if not yet initialized
- auto_init_if_empty(*output, input->clone()->set_data_type(output_dt));
- }
-
- bool window_changed = false;
- unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(data_type);
-
- // Update processed elements when input is S32 (comes from quantization input)
- if(data_type == DataType::S32)
- {
- num_elems_processed_per_iteration = 16;
- }
-
- // Configure kernel window
- Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
- AccessWindowHorizontal input_access(input, 0, num_elems_processed_per_iteration);
-
- if(output != nullptr && (output->total_size() != 0))
- {
- AccessWindowHorizontal output_access(output, 0, num_elems_processed_per_iteration);
-
- if(bias == nullptr)
- {
- window_changed = update_window_and_padding(win, input_access, output_access);
- }
- else
- {
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = update_window_and_padding(win, input_access, output_access, bias_access);
- }
-
- output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape()));
- }
- else
- {
- if(bias == nullptr)
- {
- window_changed = update_window_and_padding(win, input_access);
- }
- else
- {
- if(input->data_layout() == DataLayout::NCHW)
- {
- AccessWindowStatic bias_access(bias, 0, 0, bias->dimension(0), bias->dimension(1));
- window_changed = update_window_and_padding(win, input_access, bias_access);
- }
- else
- {
- AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration);
- window_changed = update_window_and_padding(win, input_access, bias_access);
- }
- }
-
- input_access.set_valid_region(win, ValidRegion(Coordinates(), input->tensor_shape()));
- }
-
- Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
- return std::make_pair(err, win);
-}
-
template <typename T, bool has_bias>
typename std::enable_if<arm_compute::utils::traits::is_floating_point<T>::value, void>::type
output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITensor *output,
@@ -171,23 +99,50 @@ output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window, ITe
ARM_COMPUTE_UNUSED(result_shift);
ARM_COMPUTE_UNUSED(result_offset_after_shift);
- Iterator in(input, window);
- Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates & id)
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / input->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(input, win);
+ Iterator out(output, win);
+ execute_window_loop(win, [&](const Coordinates & id)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr);
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
+ {
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr()) + x;
+ auto v_in = wrapper::vloadq(in_ptr);
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
+ v_in = wrapper::vadd(v_in, vb);
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, v_in);
+ }
- // Accumulate bias
- if(has_bias)
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
{
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z()))), ExactTagType{});
- v_in = wrapper::vadd(v_in, vb);
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto b = *reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
+
+ *(reinterpret_cast<T *>(out.ptr()) + x) = s_in;
}
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr, v_in);
},
in, out);
}
@@ -202,29 +157,57 @@ output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window, ITe
ARM_COMPUTE_UNUSED(result_offset_after_shift);
Window window_bias = window;
+ window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
window_bias.set(3, Window::Dimension(0, 0, 0));
- Iterator in(input, window);
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / input->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(input, win);
Iterator bi(bias, window_bias);
- Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates &)
- {
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
- auto v_in = wrapper::vloadq(in_ptr);
+ Iterator out(output, win);
- // Accumulate bias
- if(has_bias)
+ execute_window_loop(win, [&](const Coordinates &)
+ {
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
{
- const auto bias_ptr = reinterpret_cast<T *>(bi.ptr());
- v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<const T *>(in.ptr());
+ auto v_in = wrapper::vloadq(in_ptr + x);
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ v_in = wrapper::vadd(v_in, wrapper::vloadq(bias_ptr));
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ wrapper::vstore(out_ptr + x, v_in);
}
- const auto out_ptr = reinterpret_cast<T *>(out.ptr());
- wrapper::vstore(out_ptr, v_in);
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ auto s_in = *(reinterpret_cast<const T *>(in.ptr()) + x);
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<T *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
+
+ const auto out_ptr = reinterpret_cast<T *>(out.ptr());
+ *(out_ptr + x) = s_in;
+ }
},
in, bi, out);
}
@@ -242,40 +225,70 @@ void output_stage_nchw(ITensor *input, const ITensor *bias, const Window &window
const VectorType min = wrapper::vdup_n(std::numeric_limits<TOut>::lowest(), TagType{});
const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
- Iterator in(input, window);
- Iterator out(output, window);
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / input->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
- execute_window_loop(window, [&](const Coordinates & id)
+ Iterator in(input, win);
+ Iterator out(output, win);
+
+ execute_window_loop(win, [&](const Coordinates & id)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
- int32x4x4_t v_in =
- {
- {
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12)
- }
- };
- // Accumulate bias
- if(has_bias)
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
{
- const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
- v_in =
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in =
{
{
- wrapper::vadd(v_in.val[0], vb),
- wrapper::vadd(v_in.val[1], vb),
- wrapper::vadd(v_in.val[2], vb),
- wrapper::vadd(v_in.val[3], vb)
+ wrapper::vloadq(in_ptr),
+ wrapper::vloadq(in_ptr + 4),
+ wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12)
}
};
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto vb = wrapper::vdup_n(*reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z()))), TagType{});
+ v_in =
+ {
+ {
+ wrapper::vadd(v_in.val[0], vb),
+ wrapper::vadd(v_in.val[1], vb),
+ wrapper::vadd(v_in.val[2], vb),
+ wrapper::vadd(v_in.val[3], vb)
+ }
+ };
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32,
+ min, max, false));
}
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
+ {
+ // Get bias and pointer to input
+ int32_t s_in = *(reinterpret_cast<const int32_t *>(in.ptr()) + x);
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto b = *reinterpret_cast<const int32_t *>(bias->ptr_to_element(Coordinates(id.z())));
+ s_in += b;
+ }
+
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+ }
},
in, out);
}
@@ -292,41 +305,71 @@ void output_stage_nhwc(ITensor *input, const ITensor *bias, const Window &window
const VectorType max = wrapper::vdup_n(std::numeric_limits<TOut>::max(), TagType{});
Window window_bias = window;
+ window_bias.set(Window::DimX, Window::Dimension(0, 1, 1));
window_bias.set(Window::DimY, Window::Dimension(0, 0, 0));
window_bias.set(Window::DimZ, Window::Dimension(0, 0, 0));
window_bias.set(3, Window::Dimension(0, 0, 0));
- Iterator in(input, window);
+ const int window_start_x = window.x().start();
+ const int window_end_x = window.x().end();
+ const int window_step_x = 16 / input->info()->element_size();
+ Window win = window;
+ win.set(Window::DimX, Window::Dimension(0, 1, 1));
+
+ Iterator in(input, win);
Iterator bi(bias, window_bias);
+ Iterator out(output, win);
- Iterator out(output, window);
- execute_window_loop(window, [&](const Coordinates &)
+ execute_window_loop(win, [&](const Coordinates &)
{
- // Get bias and pointer to input
- const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr());
- int32x4x4_t v_in =
+ int x = window_start_x;
+ for(; x < (window_end_x - window_step_x); x += window_step_x)
{
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32x4x4_t v_in =
{
- wrapper::vloadq(in_ptr),
- wrapper::vloadq(in_ptr + 4),
- wrapper::vloadq(in_ptr + 8),
- wrapper::vloadq(in_ptr + 12),
+ {
+ wrapper::vloadq(in_ptr),
+ wrapper::vloadq(in_ptr + 4),
+ wrapper::vloadq(in_ptr + 8),
+ wrapper::vloadq(in_ptr + 12),
+ }
+ };
+
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+
+ wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
+ wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
+ wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
+ wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
}
- };
- // Accumulate bias
- if(has_bias)
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
+ }
+
+ // Left-overs loop
+ for(; x < window_end_x; ++x)
{
- const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr());
+ // Get bias and pointer to input
+ const auto in_ptr = reinterpret_cast<int32_t *>(in.ptr()) + x;
+ int32_t s_in = *in_ptr;
- wrapper::vadd(v_in.val[0], wrapper::vloadq(bias_ptr));
- wrapper::vadd(v_in.val[1], wrapper::vloadq(bias_ptr + 4));
- wrapper::vadd(v_in.val[2], wrapper::vloadq(bias_ptr + 8));
- wrapper::vadd(v_in.val[3], wrapper::vloadq(bias_ptr + 12));
- }
+ // Accumulate bias
+ if(has_bias)
+ {
+ const auto bias_ptr = reinterpret_cast<int32_t *>(bi.ptr()) + x;
+ s_in += *bias_ptr;
+ }
- const auto out_ptr = reinterpret_cast<TOut *>(out.ptr());
- wrapper::vstore(out_ptr, finalize_quantization(v_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift_s32, min, max, false));
+ const auto out_ptr = reinterpret_cast<TOut *>(out.ptr()) + x;
+ *out_ptr = finalize_quantization(s_in, result_fixedpoint_multiplier, result_shift, result_offset_after_shift,
+ std::numeric_limits<TOut>::lowest(), std::numeric_limits<TOut>::max(), false);
+ }
},
in, bi, out);
}
@@ -352,10 +395,29 @@ void NEDirectConvolutionLayerOutputStageKernel::configure(ITensor *input, const
_result_shift = info.result_shift;
_result_offset_after_shift = info.result_offset_after_shift;
- // Configure kernel window
- auto win_config = validate_and_configure_window(input->info(), (bias == nullptr) ? nullptr : bias->info(), (output == nullptr) ? nullptr : output->info(), info);
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
+ // Auto-initialize output output if required
+ if(output != nullptr && output->info() != nullptr)
+ {
+ // Work out expected output data type
+ const DataType output_dt = (input->info()->data_type() == DataType::S32) ? info.output_data_type : DataType::S32;
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(output_dt));
+ }
+
+ Window win = calculate_max_window(*input->info(), Steps());
+ Coordinates coord;
+ coord.set_num_dimensions(input->info()->num_dimensions());
+
+ if(output != nullptr && (output->info()->total_size() != 0))
+ {
+ output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape()));
+ }
+ else
+ {
+ input->info()->set_valid_region(ValidRegion(coord, input->info()->tensor_shape()));
+ }
+
+ INEKernel::configure(win);
const bool has_bias = bias != nullptr;
const bool is_qasymm8_signed = (output != nullptr) ? is_data_type_quantized_asymmetric_signed(output->info()->data_type()) : false;
@@ -435,11 +497,6 @@ Status NEDirectConvolutionLayerOutputStageKernel::validate(const ITensorInfo *in
const DirectConvolutionLayerOutputStageKernelInfo &info)
{
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, info));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
- bias == nullptr ? nullptr : bias->clone().get(),
- output == nullptr ? nullptr : output->clone().get(),
- info)
- .first);
return Status{};
}
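
For the quantized S32 paths, the leftover elements no longer go through the 16-wide vector call to finalize_quantization(); each tail element is requantized via a scalar overload instead. As a rough, self-contained approximation of what that per-element step computes (the library's helper uses gemmlowp-style saturating rounding; the simplified arithmetic and names below are illustrative only, not ACL API):

    #include <algorithm>
    #include <cstdint>

    // Simplified scalar requantization of one int32 accumulator to a signed
    // 8-bit value: fixed-point multiply, rounding right shift, add the output
    // offset, then clamp to the representable range.
    int8_t requantize_one(int32_t acc, int32_t multiplier /* Q0.31 */, int32_t shift, int32_t offset)
    {
        // Fixed-point multiply, keeping (roughly) the high 32 bits with rounding.
        const int64_t prod  = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        int32_t       value = static_cast<int32_t>((prod + (int64_t(1) << 30)) >> 31);

        // Rounding right shift by the requantization exponent.
        if(shift > 0)
        {
            value = (value + (1 << (shift - 1))) >> shift;
        }

        // Add the output zero point and saturate to the int8 output range.
        value += offset;
        return static_cast<int8_t>(std::clamp(value, -128, 127));
    }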