From 1b993389a3ac0cd1b0edc0b11e92fbdee127576f Mon Sep 17 00:00:00 2001 From: Abe Mbise Date: Tue, 19 Dec 2017 13:51:59 +0000 Subject: COMPMID-568: Implement Canny edge function for CL/NEON Change-Id: Ic5f197463f962bac4b23663bcef7ac744be6fc2a Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/114250 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- src/core/CL/cl_kernels/canny.cl | 11 ++++--- src/core/NEON/kernels/NECannyEdgeKernel.cpp | 50 ++++++++++++++--------------- src/runtime/CL/functions/CLCannyEdge.cpp | 15 ++++++--- src/runtime/NEON/functions/NECannyEdge.cpp | 19 ++++++----- 4 files changed, 52 insertions(+), 43 deletions(-) (limited to 'src') diff --git a/src/core/CL/cl_kernels/canny.cl b/src/core/CL/cl_kernels/canny.cl index 166d681755..f60359f0f4 100644 --- a/src/core/CL/cl_kernels/canny.cl +++ b/src/core/CL/cl_kernels/canny.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -148,6 +148,9 @@ __kernel void combine_gradients_L2( vstore4(convert_uchar4_sat_rte(p), 0, angle.ptr); } +#define EDGE 255 +#define NO_EDGE 0 + /** Array that holds the relative coordinates offset for the neighbouring pixels. */ __constant short4 neighbours_coords[] = @@ -203,6 +206,7 @@ __kernel void suppress_non_maximum( DATA_TYPE_IN gradient = *((__global DATA_TYPE_IN *)grad.ptr); uchar an = convert_ushort(*angle.ptr); + // Early return if not greater than lower threshold if(gradient <= lower_thr) { return; @@ -224,7 +228,6 @@ __kernel void suppress_non_maximum( } } -#define EDGE 255 #define hysteresis_local_stack_L1 8 // The size of level 1 stack. This has to agree with the host side #define hysteresis_local_stack_L2 16 // The size of level 2 stack, adjust this can impact the match rate with VX implementation @@ -333,7 +336,7 @@ kernel void hysteresis( // If less than upper threshold set to NO_EDGE and return if(val <= up_thr) { - *offset(&out, x, y) = 0; + *offset(&out, x, y) = NO_EDGE; return; } @@ -372,7 +375,7 @@ kernel void hysteresis( // Get direction pixel indices int N = max(y - 1, 0), S = min(y + 1, height - 2), W = max(x - 1, 0), E = min(x + 1, width - 2); - // Check 8 pixels around for week edges where low_thr < val <= up_thr + // Check 8 pixels around for weak edges where low_thr < val <= up_thr x_tmp = vload4(0, (__global DATA_TYPE_IN *)offset(&src, W, N)); v_tmp = vload4(0, (__global uint *)offset(&visited, W, N)); check_pixel(((x_tmp.s0 <= low_thr) || v_tmp.s0 || (x_tmp.s0 > up_thr)), W, N, x, y); // NW diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp index 9dfd580a25..dc37452415 100644 --- a/src/core/NEON/kernels/NECannyEdgeKernel.cpp +++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016, 2017 ARM Limited. + * Copyright (c) 2016-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -567,29 +567,29 @@ inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, co const uint32x4_t mk0_0 = vld1q_u32(in - 1); const uint32x4_t mk0_1 = vld1q_u32(in + 1); uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0)); - mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0)); - mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1)); + mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_0)); + mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_1)); // 45 degree const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1); const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1); uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1)); - mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0)); - mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1)); + mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_0)); + mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_1)); // 90 degree const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag); const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag); uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2)); - mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0)); - mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1)); + mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_0)); + mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_1)); // 135 degree const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1); const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1); uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3)); - mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0)); - mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1)); + mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_0)); + mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_1)); // Merge masks mask0 = vorrq_u32(mask0, mask1); @@ -1338,29 +1338,29 @@ void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const v const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1); const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1); uint16x8_t mask0 = vceqq_u16(pc16, vdupq_n_u16(0)); - mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_0)); - mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_1)); + mask0 = vandq_u16(mask0, vcgtq_u16(mc, mk0_0)); + mask0 = vandq_u16(mask0, vcgtq_u16(mc, mk0_1)); // 45 degree const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1); const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1); uint16x8_t mask1 = vceqq_u16(pc16, vdupq_n_u16(1)); - mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_0)); - mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_1)); + mask1 = vandq_u16(mask1, vcgtq_u16(mc, mk45_0)); + mask1 = vandq_u16(mask1, vcgtq_u16(mc, mk45_1)); // 90 degree const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag); const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag); uint16x8_t mask2 = vceqq_u16(pc16, vdupq_n_u16(2)); - mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_0)); - mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_1)); + mask2 = vandq_u16(mask2, vcgtq_u16(mc, mk90_0)); + mask2 = vandq_u16(mask2, vcgtq_u16(mc, mk90_1)); // 135 degree const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1); const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1); uint16x8_t mask3 = vceqq_u16(pc16, vdupq_n_u16(3)); - mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_0)); - mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_1)); + mask3 = vandq_u16(mask3, vcgtq_u16(mc, mk135_0)); + mask3 = vandq_u16(mask3, vcgtq_u16(mc, mk135_1)); // Merge masks mask0 = vorrq_u16(mask0, mask1); @@ -1399,29 +1399,29 @@ inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32x4_t mk0_0 = vld1q_u32(input - 1); const uint32x4_t mk0_1 = vld1q_u32(input + 1); uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0)); - mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0)); - mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1)); + mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_0)); + mask0 = vandq_u32(mask0, vcgtq_u32(mc, mk0_1)); // 45 degree const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1); const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1); uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1)); - mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0)); - mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1)); + mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_0)); + mask1 = vandq_u32(mask1, vcgtq_u32(mc, mk45_1)); // 90 degree const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag); const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag); uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2)); - mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0)); - mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1)); + mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_0)); + mask2 = vandq_u32(mask2, vcgtq_u32(mc, mk90_1)); // 135 degree const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1); const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1); uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3)); - mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0)); - mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1)); + mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_0)); + mask3 = vandq_u32(mask3, vcgtq_u32(mc, mk135_1)); // Merge masks mask0 = vorrq_u32(mask0, mask1); diff --git a/src/runtime/CL/functions/CLCannyEdge.cpp b/src/runtime/CL/functions/CLCannyEdge.cpp index 5acb8e7ddb..ed5834531e 100644 --- a/src/runtime/CL/functions/CLCannyEdge.cpp +++ b/src/runtime/CL/functions/CLCannyEdge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -50,17 +50,23 @@ CLCannyEdge::CLCannyEdge(std::shared_ptr memory_manager) // NOLI _visited(), _recorded(), _l1_list_counter(), - _l1_stack() + _l1_stack(), + _output(nullptr) { } -void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value) +void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, + uint8_t constant_border_value) { + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); + ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr); + _output = output; + const unsigned int L1_hysteresis_stack_size = 8; const TensorShape shape = input->info()->tensor_shape(); @@ -122,7 +128,7 @@ void CLCannyEdge::configure(ICLTensor *input, ICLTensor *output, int32_t upper_t } else { - ARM_COMPUTE_ERROR("Gradient %d size not supported", gradient_size); + ARM_COMPUTE_ERROR("Gradient size %d not supported", gradient_size); } // Manage intermediate buffers @@ -187,6 +193,7 @@ void CLCannyEdge::run() CLScheduler::get().enqueue(_non_max_suppr, false); // Clear temporary structures and run edge trace + _output->clear(CLScheduler::get().queue()); _visited.clear(CLScheduler::get().queue()); _recorded.clear(CLScheduler::get().queue()); _l1_list_counter.clear(CLScheduler::get().queue()); diff --git a/src/runtime/NEON/functions/NECannyEdge.cpp b/src/runtime/NEON/functions/NECannyEdge.cpp index c27ff2f935..1d73148f47 100644 --- a/src/runtime/NEON/functions/NECannyEdge.cpp +++ b/src/runtime/NEON/functions/NECannyEdge.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -61,12 +61,12 @@ NECannyEdge::NECannyEdge(std::shared_ptr memory_manager) // NOLI void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, int32_t lower_thr, int32_t gradient_size, int32_t norm_type, BorderMode border_mode, uint8_t constant_border_value, bool use_fp16) { + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); - ARM_COMPUTE_ERROR_ON(gradient_size < 3); - ARM_COMPUTE_ERROR_ON(gradient_size > 7); - ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr); ARM_COMPUTE_ERROR_ON((1 != norm_type) && (2 != norm_type)); + ARM_COMPUTE_ERROR_ON((gradient_size != 3) && (gradient_size != 5) && (gradient_size != 7)); + ARM_COMPUTE_ERROR_ON(lower_thr > upper_thr); _output = output; @@ -119,7 +119,7 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, } else { - ARM_COMPUTE_ERROR("Gradient size not supported\n"); + ARM_COMPUTE_ERROR("Gradient size %d not supported\n", gradient_size); } // Manage intermediate buffers @@ -171,24 +171,23 @@ void NECannyEdge::configure(ITensor *input, ITensor *output, int32_t upper_thr, void NECannyEdge::run() { ARM_COMPUTE_ERROR_ON_MSG(_sobel == nullptr, "Unconfigured function"); - ARM_COMPUTE_ERROR_ON(_output == nullptr); _memory_group.acquire(); // Run sobelNxN _sobel->run(); - // Fill border before non-maxima suppression. Nop for border mode undefined. - NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ); - // Run gradient NEScheduler::get().schedule(_gradient.get(), Window::DimY); + // Fill border before non-maxima suppression. Nop for border mode undefined. + NEScheduler::get().schedule(&_border_mag_gradient, Window::DimZ); + // Run non-maxima suppression NEScheduler::get().schedule(&_non_max_suppr, Window::DimY); ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); - memset(_output->buffer(), 0, _output->info()->total_size()); + std::fill_n(_output->buffer(), _output->info()->total_size(), 0); // Fill border before edge trace NEScheduler::get().schedule(&_border_edge_trace, Window::DimZ); -- cgit v1.2.1