aboutsummaryrefslogtreecommitdiff
path: root/src/core/NEON/kernels/NEFillBorderKernel.cpp
diff options
context:
space:
mode:
authorPablo Tello <pablo.tello@arm.com>2017-08-09 16:33:49 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commit62eeae41442b5ca883836d4039208f56fb2407aa (patch)
treeca11be93be70b565f6696c0c4fb6726593c37dde /src/core/NEON/kernels/NEFillBorderKernel.cpp
parent25466a91f6058d994a5212f281e00f95c0fe8c2a (diff)
downloadComputeLibrary-62eeae41442b5ca883836d4039208f56fb2407aa.tar.gz
COMPMID-345: Optimization for NEFillBorder kernel.
It's about 0.8 faster than the old code for the special case where the left and top borders are both of size 1. This should slightly improve the performance of many kernels, especially DirectConvolution, where the kernel size is 3. Change-Id: I7d150cac4b1d9bf3bbf897ef6151e139fc34b39c Reviewed-on: http://mpd-gerrit.cambridge.arm.com/83403 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/NEFillBorderKernel.cpp')
-rw-r--r--src/core/NEON/kernels/NEFillBorderKernel.cpp71
1 files changed, 67 insertions, 4 deletions
diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp
index cd84e36aad..7d191c18b0 100644
--- a/src/core/NEON/kernels/NEFillBorderKernel.cpp
+++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp
@@ -39,6 +39,63 @@
using namespace arm_compute;
+namespace
+{
+/** Fill the constant border around the valid region of a single-channel tensor,
+ *  for border configurations selected at compile time.
+ *
+ * Only the explicit specialisations in this file provide a definition.
+ *
+ * @tparam T      Element type stored in the tensor.
+ * @tparam leftx  First compile-time border size used to select a specialisation.
+ * @tparam rightx Second compile-time border size used to select a specialisation.
+ *                NOTE(review): the dispatch in NEFillBorderKernel::run() picks <float, 1u, 1u>
+ *                when the *left* and *top* borders are 1, so this parameter looks like it
+ *                stands for the top border rather than the right one - confirm and rename.
+ *
+ * @param[in,out] tensor                Tensor whose border elements are overwritten.
+ * @param[in]     window                Window iterated to visit every XY plane.
+ * @param[in]     right                 Number of border elements to the right of each row.
+ * @param[in]     bottom                Number of border rows below the valid region.
+ * @param[in]     constant_border_value Value written into every border element.
+ */
+template <typename T, unsigned int leftx, unsigned int rightx>
+void fill_constant_value_single_channel_special(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value);

+/** Specialisation for float tensors that writes exactly one border element to the left of
+ *  each row and exactly one border row above the valid region.
+ *
+ * The right and bottom border sizes are taken from the run-time arguments.
+ */
+template <>
+inline void fill_constant_value_single_channel_special<float, 1u, 1u>(ITensor *tensor, const Window &window, unsigned int right, unsigned int bottom, const PixelValue &constant_border_value)
+{
+    float border_value = 0.f;
+    constant_border_value.get(border_value);
+    uint8_t *const start_valid_region = tensor->ptr_to_element(tensor->info()->valid_region().anchor);
+    // Copy the extents by value: binding a reference to an element of the object returned by
+    // valid_region() would dangle if that object is a temporary.
+    const size_t width   = tensor->info()->valid_region().shape[0];
+    const size_t height  = tensor->info()->valid_region().shape[1];
+    const int    stridey = tensor->info()->strides_in_bytes()[1];
+    // Length of a full border row: 1 left element + valid width + right border
+    const size_t border_row_length = 1 + width + right;

+    // Left and right border
+    Window vertical(window);
+    vertical.set(Window::DimY, Window::Dimension(0, height, 1));

+    Iterator vertical_it(tensor, vertical);

+    execute_window_loop(vertical, [&](const Coordinates &)
+    {
+        const auto row_start = reinterpret_cast<float *>(start_valid_region + vertical_it.offset());

+        // Single element on the left, `right` elements after the end of the row
+        *(row_start - 1) = border_value;
+        std::fill_n(row_start + width, right, border_value);
+    },
+    vertical_it);

+    // Top and bottom border
+    Iterator plane_it(tensor, window);

+    // Iterate over all XY planes
+    execute_window_loop(window, [&](const Coordinates &)
+    {
+        uint8_t *base_addr = start_valid_region + plane_it.offset();
+        // Top border: a single row located one Y stride before the first valid row
+        const auto top_row_start = reinterpret_cast<float *>(base_addr - stridey);
+        // Fill top row including left/right borders
+        std::fill_n(top_row_start - 1, border_row_length, border_value);

+        // Bottom border
+        const unsigned low_border_size = height + bottom;
+        for(unsigned int i = height; i < low_border_size; ++i)
+        {
+            const auto row_start = reinterpret_cast<float *>(base_addr + i * stridey);

+            // Fill bottom rows including left/right borders
+            std::fill_n(row_start - 1, border_row_length, border_value);
+        }
+    },
+    plane_it);
+}
+} // namespace
+
namespace arm_compute
{
class Coordinates;
@@ -112,7 +169,10 @@ void NEFillBorderKernel::run(const Window &window)
#endif /* ARM_COMPUTE_ENABLE_FP16 */
case DataType::F32:
static_assert(sizeof(float) == 4, "Float must be 32 bit");
- fill_constant_value_single_channel<float>(window);
+ if(_border_size.left == 1 && _border_size.top == 1)
+ fill_constant_value_single_channel_special<float, 1u, 1u>(_tensor, window, _border_size.right, _border_size.bottom, _constant_border_value);
+ else
+ fill_constant_value_single_channel<float>(window);
break;
default:
ARM_COMPUTE_ERROR("Not handled");
@@ -230,6 +290,7 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window
uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor);
const size_t &width = _tensor->info()->valid_region().shape[0];
const size_t &height = _tensor->info()->valid_region().shape[1];
+ const int stridey = _tensor->info()->strides_in_bytes()[1];
// Left and right border
Window vertical(window);
@@ -253,19 +314,21 @@ void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window
// Iterate over all XY planes
execute_window_loop(window, [&](const Coordinates & id)
{
+ uint8_t *base_addr = start_valid_region + plane_it.offset();
// Top border
for(int i = -_border_size.top; i < 0; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+ const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
// Fill top rows including left/right borders
std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);
}
// Bottom border
- for(unsigned int i = height; i < height + _border_size.bottom; ++i)
+ const unsigned low_border_size = height + _border_size.bottom;
+ for(unsigned int i = height; i < low_border_size; ++i)
{
- const auto row_start = reinterpret_cast<T *>(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]);
+ const auto row_start = reinterpret_cast<T *>(base_addr + i * stridey);
// Fill bottom rows including left/right borders
std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value);