From 6ff3b19ee6120edf015fad8caab2991faa3070af Mon Sep 17 00:00:00 2001
From: Anthony Barbier
Date: Mon, 4 Sep 2017 18:44:23 +0100
Subject: COMPMID-344 Updated doxygen

Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae
---
 .../NEON/kernels/NEAbsoluteDifferenceKernel.cpp | 211 +++
 src/core/NEON/kernels/NEAccumulateKernel.cpp | 357 ++++
 src/core/NEON/kernels/NEActivationLayerKernel.cpp | 302 ++++
 .../NEON/kernels/NEArithmeticAdditionKernel.cpp | 378 ++++
 .../NEON/kernels/NEArithmeticSubtractionKernel.cpp | 371 ++++
 .../kernels/NEBatchNormalizationLayerKernel.cpp | 187 ++
 src/core/NEON/kernels/NEBitwiseAndKernel.cpp | 109 ++
 src/core/NEON/kernels/NEBitwiseNotKernel.cpp | 96 +
 src/core/NEON/kernels/NEBitwiseOrKernel.cpp | 109 ++
 src/core/NEON/kernels/NEBitwiseXorKernel.cpp | 105 ++
 src/core/NEON/kernels/NEBox3x3Kernel.cpp | 220 +++
 src/core/NEON/kernels/NECannyEdgeKernel.cpp | 1856 ++++++++++++++++++++
 src/core/NEON/kernels/NEChannelCombineKernel.cpp | 467 +++++
 src/core/NEON/kernels/NEChannelExtractKernel.cpp | 354 ++++
 src/core/NEON/kernels/NECol2ImKernel.cpp | 124 ++
 src/core/NEON/kernels/NEColorConvertKernel.cpp | 582 ++++++
 src/core/NEON/kernels/NEConvolutionKernel.cpp | 1618 +++++++++++++++++
 .../kernels/NECumulativeDistributionKernel.cpp | 110 ++
 src/core/NEON/kernels/NEDepthConcatenateKernel.cpp | 105 ++
 src/core/NEON/kernels/NEDepthConvertKernel.cpp | 384 ++++
 src/core/NEON/kernels/NEDerivativeKernel.cpp | 224 +++
 src/core/NEON/kernels/NEDilateKernel.cpp | 126 ++
 ...EDirectConvolutionLayerBiasAccumulateKernel.cpp | 207 +++
 .../kernels/NEDirectConvolutionLayerKernel.cpp | 817 +++++++++
 src/core/NEON/kernels/NEErodeKernel.cpp | 126 ++
 src/core/NEON/kernels/NEFastCornersKernel.cpp | 474 +++++
 src/core/NEON/kernels/NEFillArrayKernel.cpp | 91 +
 src/core/NEON/kernels/NEFillBorderKernel.cpp | 259 +++
 src/core/NEON/kernels/NEFillInnerBorderKernel.cpp | 137 ++
 .../NEON/kernels/NEGEMMInterleave4x4Kernel.cpp | 191 ++
 .../kernels/NEGEMMLowpMatrixMultiplyKernel.cpp | 423 +++++
 .../kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp | 128 ++
 .../NEON/kernels/NEGEMMMatrixAdditionKernel.cpp | 202 +++
 .../NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp | 1168 ++++++++++++
 src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp | 150 ++
 src/core/NEON/kernels/NEGaussian3x3Kernel.cpp | 132 ++
 src/core/NEON/kernels/NEGaussian5x5Kernel.cpp | 203 +++
 src/core/NEON/kernels/NEGaussianPyramidKernel.cpp | 279 +++
 src/core/NEON/kernels/NEHOGDescriptorKernel.cpp | 802 +++++++++
 src/core/NEON/kernels/NEHOGDetectorKernel.cpp | 186 ++
 src/core/NEON/kernels/NEHarrisCornersKernel.cpp | 1137 ++++++++++++
 src/core/NEON/kernels/NEHistogramKernel.cpp | 252 +++
 src/core/NEON/kernels/NEIm2ColKernel.cpp | 338 ++++
 src/core/NEON/kernels/NEIntegralImageKernel.cpp | 141 ++
 src/core/NEON/kernels/NELKTrackerKernel.cpp | 533 ++++++
 .../NELocallyConnectedMatrixMultiplyKernel.cpp | 226 +++
 src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp | 869 +++++++++
 src/core/NEON/kernels/NEMeanStdDevKernel.cpp | 152 ++
 src/core/NEON/kernels/NEMedian3x3Kernel.cpp | 135 ++
 src/core/NEON/kernels/NEMinMaxLocationKernel.cpp | 361 ++++
 src/core/NEON/kernels/NENonLinearFilterKernel.cpp | 1009 +++++++++++
 .../kernels/NENonMaximaSuppression3x3Kernel.cpp | 513 ++++++
 .../NEON/kernels/NENormalizationLayerKernel.cpp | 218 +++
 .../kernels/NEPixelWiseMultiplicationKernel.cpp | 524 ++++++
 src/core/NEON/kernels/NEPoolingLayerKernel.cpp | 415 +++++
 src/core/NEON/kernels/NERemapKernel.cpp | 226 +++
 src/core/NEON/kernels/NEScaleKernel.cpp | 359 ++++
src/core/NEON/kernels/NEScharr3x3Kernel.cpp | 259 +++ src/core/NEON/kernels/NESobel3x3Kernel.cpp | 269 +++ src/core/NEON/kernels/NESobel5x5Kernel.cpp | 402 +++++ src/core/NEON/kernels/NESobel7x7Kernel.cpp | 520 ++++++ src/core/NEON/kernels/NESoftmaxLayerKernel.cpp | 474 +++++ src/core/NEON/kernels/NETableLookupKernel.cpp | 142 ++ src/core/NEON/kernels/NEThresholdKernel.cpp | 129 ++ src/core/NEON/kernels/NETransposeKernel.cpp | 241 +++ src/core/NEON/kernels/NEWarpKernel.cpp | 651 +++++++ src/core/NEON/kernels/NEWeightsReshapeKernel.cpp | 175 ++ 67 files changed, 25040 insertions(+) create mode 100644 src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp create mode 100644 src/core/NEON/kernels/NEAccumulateKernel.cpp create mode 100644 src/core/NEON/kernels/NEActivationLayerKernel.cpp create mode 100644 src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp create mode 100644 src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp create mode 100644 src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp create mode 100644 src/core/NEON/kernels/NEBitwiseAndKernel.cpp create mode 100644 src/core/NEON/kernels/NEBitwiseNotKernel.cpp create mode 100644 src/core/NEON/kernels/NEBitwiseOrKernel.cpp create mode 100644 src/core/NEON/kernels/NEBitwiseXorKernel.cpp create mode 100644 src/core/NEON/kernels/NEBox3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NECannyEdgeKernel.cpp create mode 100644 src/core/NEON/kernels/NEChannelCombineKernel.cpp create mode 100644 src/core/NEON/kernels/NEChannelExtractKernel.cpp create mode 100644 src/core/NEON/kernels/NECol2ImKernel.cpp create mode 100644 src/core/NEON/kernels/NEColorConvertKernel.cpp create mode 100644 src/core/NEON/kernels/NEConvolutionKernel.cpp create mode 100644 src/core/NEON/kernels/NECumulativeDistributionKernel.cpp create mode 100644 src/core/NEON/kernels/NEDepthConcatenateKernel.cpp create mode 100644 src/core/NEON/kernels/NEDepthConvertKernel.cpp create mode 100644 src/core/NEON/kernels/NEDerivativeKernel.cpp create mode 100644 src/core/NEON/kernels/NEDilateKernel.cpp create mode 100644 src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp create mode 100644 src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp create mode 100644 src/core/NEON/kernels/NEErodeKernel.cpp create mode 100644 src/core/NEON/kernels/NEFastCornersKernel.cpp create mode 100644 src/core/NEON/kernels/NEFillArrayKernel.cpp create mode 100644 src/core/NEON/kernels/NEFillBorderKernel.cpp create mode 100644 src/core/NEON/kernels/NEFillInnerBorderKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp create mode 100644 src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp create mode 100644 src/core/NEON/kernels/NEGaussian3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NEGaussian5x5Kernel.cpp create mode 100644 src/core/NEON/kernels/NEGaussianPyramidKernel.cpp create mode 100644 src/core/NEON/kernels/NEHOGDescriptorKernel.cpp create mode 100644 src/core/NEON/kernels/NEHOGDetectorKernel.cpp create mode 100644 src/core/NEON/kernels/NEHarrisCornersKernel.cpp create mode 100644 src/core/NEON/kernels/NEHistogramKernel.cpp create mode 100644 src/core/NEON/kernels/NEIm2ColKernel.cpp create mode 100644 
src/core/NEON/kernels/NEIntegralImageKernel.cpp create mode 100644 src/core/NEON/kernels/NELKTrackerKernel.cpp create mode 100644 src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp create mode 100644 src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp create mode 100644 src/core/NEON/kernels/NEMeanStdDevKernel.cpp create mode 100644 src/core/NEON/kernels/NEMedian3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NEMinMaxLocationKernel.cpp create mode 100644 src/core/NEON/kernels/NENonLinearFilterKernel.cpp create mode 100644 src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NENormalizationLayerKernel.cpp create mode 100644 src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp create mode 100644 src/core/NEON/kernels/NEPoolingLayerKernel.cpp create mode 100644 src/core/NEON/kernels/NERemapKernel.cpp create mode 100644 src/core/NEON/kernels/NEScaleKernel.cpp create mode 100644 src/core/NEON/kernels/NEScharr3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NESobel3x3Kernel.cpp create mode 100644 src/core/NEON/kernels/NESobel5x5Kernel.cpp create mode 100644 src/core/NEON/kernels/NESobel7x7Kernel.cpp create mode 100644 src/core/NEON/kernels/NESoftmaxLayerKernel.cpp create mode 100644 src/core/NEON/kernels/NETableLookupKernel.cpp create mode 100644 src/core/NEON/kernels/NEThresholdKernel.cpp create mode 100644 src/core/NEON/kernels/NETransposeKernel.cpp create mode 100644 src/core/NEON/kernels/NEWarpKernel.cpp create mode 100644 src/core/NEON/kernels/NEWeightsReshapeKernel.cpp (limited to 'src/core/NEON') diff --git a/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp new file mode 100644 index 0000000000..edb0a0f304 --- /dev/null +++ b/src/core/NEON/kernels/NEAbsoluteDifferenceKernel.cpp @@ -0,0 +1,211 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IAccessWindow.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <arm_neon.h>
+#include <cstdint>
+
+using namespace arm_compute;
+
+namespace arm_compute
+{
+class Coordinates;
+} // namespace arm_compute
+
+namespace
+{
+void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    Iterator input1(in1, window);
+    Iterator input2(in2, window);
+    Iterator output(out, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const uint8x16_t input1_val = vld1q_u8(input1.ptr());
+        const uint8x16_t input2_val = vld1q_u8(input2.ptr());
+
+        vst1q_u8(output.ptr(), vabdq_u8(input1_val, input2_val));
+    },
+    input1, input2, output);
+}
+
+inline int16x8x2_t vqabd2q_s16(const int16x8x2_t &v1, const int16x8x2_t &v2)
+{
+    const int16x8x2_t res =
+    {
+        {
+            vqabsq_s16(vqsubq_s16(v1.val[0], v2.val[0])),
+            vqabsq_s16(vqsubq_s16(v1.val[1], v2.val[1]))
+        }
+    };
+
+    return res;
+}
+
+void abs_diff_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    Iterator input1(in1, window);
+    Iterator input2(in2, window);
+    Iterator output(out, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        int16x8x2_t input1_val = vld2q_s16(reinterpret_cast<const int16_t *>(input1.ptr()));
+        int16x8x2_t input2_val = vld2q_s16(reinterpret_cast<const int16_t *>(input2.ptr()));
+        vst2q_s16(reinterpret_cast<int16_t *>(output.ptr()), vqabd2q_s16(input1_val, input2_val));
+    },
+    input1, input2, output);
+}
+
+void abs_diff_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    Iterator input1(in1, window);
+    Iterator input2(in2, window);
+    Iterator output(out, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const uint8x16_t  input1_val = vld1q_u8(input1.ptr());
+        const int16x8x2_t input2_val =
+        {
+            {
+                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr())),
+                vld1q_s16(reinterpret_cast<const int16_t *>(input2.ptr()) + 8)
+            }
+        };
+
+        const int16x8x2_t out_val =
+        {
+            {
+                vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(input1_val))), input2_val.val[0])),
+                vqabsq_s16(vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(input1_val))), input2_val.val[1]))
+            }
+        };
+
+        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()), out_val.val[0]);
+        vst1q_s16(reinterpret_cast<int16_t *>(output.ptr()) + 8, out_val.val[1]);
+
+    },
+    input1, input2, output);
+}
+
+void abs_diff_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+{
+    abs_diff_U8_S16_S16(in2, in1, out, window);
+}
+} // namespace
+
+NEAbsoluteDifferenceKernel::NEAbsoluteDifferenceKernel()
+    : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr)
+{
+}
+
+void NEAbsoluteDifferenceKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output);
+
+    set_shape_if_empty(*output->info(), input1->info()->tensor_shape());
+
+    if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16)
+    {
+        set_format_if_unknown(*output->info(), Format::S16);
+    }
+    else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32)
+    {
+        set_format_if_unknown(*output->info(), Format::U8);
+    }
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8),
+                             "The output image can only be U8 if both input images are U8");
+
+    _input1 = input1;
+    _input2 = input2;
+    _output = output;
+
+    const DataType input1_data_type = input1->info()->data_type();
+    const DataType input2_data_type = input2->info()->data_type();
+
+    if(input1_data_type == input2_data_type)
+    {
+        if(input1_data_type == DataType::U8)
+        {
+            _func = &abs_diff_U8_U8_U8;
+        }
+        else
+        {
+            _func = &abs_diff_S16_S16_S16;
+        }
+    }
+    else
+    {
+        if(input1_data_type == DataType::U8)
+        {
+            _func = &abs_diff_U8_S16_S16;
+        }
+        else
+        {
+            _func = &abs_diff_S16_U8_S16;
+        }
+    }
+
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+
+    // Configure kernel window
+    Window                 win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration));
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration),
+                              AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration),
+                              output_access);
+
+    ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(),
+                                                       input2->info()->valid_region());
+
+    output_access.set_valid_region(win, valid_region);
+
+    INEKernel::configure(win);
+}
+
+void NEAbsoluteDifferenceKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_func == nullptr);
+
+    _func(_input1, _input2, _output, window);
+}
diff --git a/src/core/NEON/kernels/NEAccumulateKernel.cpp b/src/core/NEON/kernels/NEAccumulateKernel.cpp
new file mode 100644
index 0000000000..e5b933a781
--- /dev/null
+++ b/src/core/NEON/kernels/NEAccumulateKernel.cpp
@@ -0,0 +1,357 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +/* Max S16 value used for saturation purposes. */ +const static uint16x8_t max_int_u16 = vdupq_n_u16(static_cast(INT16_MAX)); + +#ifdef ARM_COMPUTE_ENABLE_FP16 +namespace fp16 +{ +inline float16x8x2_t convert_u8x16_to_f16x8x2(uint8x16_t input) +{ + const float16x8x2_t out = + { + { + vcvtq_f16_u16(vmovl_u8(vget_low_u8(input))), + vcvtq_f16_u16(vmovl_u8(vget_high_u8(input))) + } + }; + + return out; +} + +inline uint8x16_t convert_f16x8x2_to_u8x16(const float16x8x2_t &input) +{ + return vcombine_u8(vmovn_u16(vcvtq_u16_f16(input.val[0])), + vmovn_u16(vcvtq_u16_f16(input.val[1]))); +} + +inline float16x8x2_t vector_accumulate_weighted(const float16x8x2_t &vec0, const float16x8x2_t &vec1, float16x8_t scale_val, float16x8_t scale_val2) +{ + const float16x8x2_t res = + { + { + vfmaq_f16(vmulq_f16(vec1.val[0], scale_val), vec0.val[0], scale_val2), + vfmaq_f16(vmulq_f16(vec1.val[1], scale_val), vec0.val[1], scale_val2) + } + }; + + return res; +} + +void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, float16x8_t scale_val, float16x8_t scale_val2) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == accum); + + const auto input_ptr = static_cast(input); + const auto accum_ptr = static_cast(accum); + + const uint8x16x4_t input_buffer = vld4q_u8(input_ptr); + uint8x16x4_t accum_buffer = vld4q_u8(accum_ptr); + + const float16x8x2_t f16_input_0 = convert_u8x16_to_f16x8x2(input_buffer.val[0]); + const float16x8x2_t f16_input_1 = convert_u8x16_to_f16x8x2(input_buffer.val[1]); + const float16x8x2_t f16_input_2 = convert_u8x16_to_f16x8x2(input_buffer.val[2]); + const float16x8x2_t f16_input_3 = convert_u8x16_to_f16x8x2(input_buffer.val[3]); + + float16x8x2_t f16_accum_0 = convert_u8x16_to_f16x8x2(accum_buffer.val[0]); + float16x8x2_t f16_accum_1 = convert_u8x16_to_f16x8x2(accum_buffer.val[1]); + float16x8x2_t f16_accum_2 = convert_u8x16_to_f16x8x2(accum_buffer.val[2]); + float16x8x2_t f16_accum_3 = convert_u8x16_to_f16x8x2(accum_buffer.val[3]); + + f16_accum_0 = vector_accumulate_weighted(f16_input_0, f16_accum_0, scale_val, scale_val2); + f16_accum_1 = vector_accumulate_weighted(f16_input_1, f16_accum_1, scale_val, scale_val2); + f16_accum_2 = vector_accumulate_weighted(f16_input_2, f16_accum_2, scale_val, scale_val2); + f16_accum_3 = vector_accumulate_weighted(f16_input_3, f16_accum_3, scale_val, scale_val2); + + accum_buffer = { { + convert_f16x8x2_to_u8x16(f16_accum_0), + convert_f16x8x2_to_u8x16(f16_accum_1), + convert_f16x8x2_to_u8x16(f16_accum_2), + convert_f16x8x2_to_u8x16(f16_accum_3) + } + }; + + vst4q_u8(accum_ptr, accum_buffer); +} +} // namespace fp16 + +void NEAccumulateWeightedFP16Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator accum(_output, window); + + const float16x8_t scale_val = vdupq_n_f16(1.f - _alpha); + const float16x8_t scale_val2 = vdupq_n_f16(_alpha); + + execute_window_loop(window, [&](const Coordinates & id) + { + fp16::acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2); + }, 
+ input, accum); +} +#endif + +namespace +{ +inline void acc_v16_u8(const void *__restrict input, void *__restrict accum) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == accum); + + const auto in = static_cast(input); + const auto out = static_cast(accum); + + uint8x16_t ta1 = vld1q_u8(in); + int16x8_t ta2 = vld1q_s16(out); + int16x8_t ta3 = vld1q_s16(out + 8); + + ta2 = vqaddq_s16(ta2, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(ta1)))); + ta3 = vqaddq_s16(ta3, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(ta1)))); + + vst1q_s16(out, ta2); + vst1q_s16(out + 8, ta3); +} + +inline float32x4x4_t convert_u8x16_to_f32x4x4(uint8x16_t input) +{ + const uint16x8_t u16_output_low = vmovl_u8(vget_low_u8(input)); + const uint16x8_t u16_output_hi = vmovl_u8(vget_high_u8(input)); + + const float32x4x4_t res = + { + { + vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_low))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_low))), + vcvtq_f32_u32(vmovl_u16(vget_low_u16(u16_output_hi))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(u16_output_hi))) + } + }; + + return res; +} + +inline uint8x16_t convert_f32x4x4_to_u8x16(const float32x4x4_t &input) +{ + return vcombine_u8(vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[0])), + vmovn_u32(vcvtq_u32_f32(input.val[1])))), + vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(input.val[2])), + vmovn_u32(vcvtq_u32_f32(input.val[3]))))); +} + +inline float32x4x4_t vector_accumulate_weighted(const float32x4x4_t &vector_input, float32x4x4_t vector_output, float32x4_t scale_val, float32x4_t scale_val2) +{ + vector_output.val[0] = vmulq_f32(vector_output.val[0], scale_val); + vector_output.val[1] = vmulq_f32(vector_output.val[1], scale_val); + vector_output.val[2] = vmulq_f32(vector_output.val[2], scale_val); + vector_output.val[3] = vmulq_f32(vector_output.val[3], scale_val); + + vector_output.val[0] = vmlaq_f32(vector_output.val[0], vector_input.val[0], scale_val2); + vector_output.val[1] = vmlaq_f32(vector_output.val[1], vector_input.val[1], scale_val2); + vector_output.val[2] = vmlaq_f32(vector_output.val[2], vector_input.val[2], scale_val2); + vector_output.val[3] = vmlaq_f32(vector_output.val[3], vector_input.val[3], scale_val2); + + return vector_output; +} + +inline void acc_we_v16_u8(const void *__restrict input, void *__restrict accum, const float32x4_t scale_val, const float32x4_t scale_val2) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == accum); + + const auto input_ptr = static_cast(input); + const auto accum_ptr = static_cast(accum); + + const uint8x16_t input_buffer = vld1q_u8(input_ptr); + const uint8x16_t accum_buffer = vld1q_u8(accum_ptr); + + const float32x4x4_t f32_input_0 = convert_u8x16_to_f32x4x4(input_buffer); + const float32x4x4_t f32_output_0 = convert_u8x16_to_f32x4x4(accum_buffer); + + const float32x4x4_t f32_res_0 = vector_accumulate_weighted(f32_input_0, f32_output_0, scale_val, scale_val2); + + vst1q_u8(accum_ptr, convert_f32x4x4_to_u8x16(f32_res_0)); +} + +void acc_sq_v16_u8(const void *__restrict input, uint32_t shift, void *__restrict accum) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == accum); + ARM_COMPUTE_ERROR_ON(shift > 15); + + const auto input_buffer = static_cast(input); + const auto accum_buffer = static_cast(accum); + + const uint8x16_t ta1 = vld1q_u8(input_buffer); + uint16x8_t ta2 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer)); + uint16x8_t ta3 = vreinterpretq_u16_s16(vld1q_s16(accum_buffer + 8)); + + const int16x8_t 
vector_shift = vdupq_n_s16(-static_cast(shift)); + + uint16x8_t linput = vmovl_u8(vget_low_u8(ta1)); + uint16x8_t hinput = vmovl_u8(vget_high_u8(ta1)); + + linput = vmulq_u16(linput, linput); + hinput = vmulq_u16(hinput, hinput); + + linput = vqshlq_u16(linput, vector_shift); + hinput = vqshlq_u16(hinput, vector_shift); + + ta2 = vqaddq_u16(ta2, linput); + ta3 = vqaddq_u16(ta3, hinput); + + vst1q_s16(accum_buffer, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta2))); + vst1q_s16(accum_buffer + 8, vreinterpretq_s16_u16(vminq_u16(max_int_u16, ta3))); +} +} // namespace + +void NEAccumulateKernel::configure(const ITensor *input, ITensor *accum) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum); + + set_shape_if_empty(*accum->info(), input->info()->tensor_shape()); + + set_format_if_unknown(*accum->info(), Format::S16); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum); + + constexpr unsigned int num_elems_processed_per_iteration = 16; + INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +void NEAccumulateKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + Iterator input(_input, window); + Iterator accum(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + acc_v16_u8(input.ptr(), accum.ptr()); + }, + input, accum); +} + +NEAccumulateWeightedKernel::NEAccumulateWeightedKernel() + : _alpha(0.0f) +{ +} + +void NEAccumulateWeightedKernel::configure(const ITensor *input, float alpha, ITensor *accum) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum); + + set_shape_if_empty(*accum->info(), input->info()->tensor_shape()); + + set_format_if_unknown(*accum->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(alpha < 0.0 || alpha > 1.0); + + _alpha = alpha; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +void NEAccumulateWeightedKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator accum(_output, window); + + const float32x4_t scale_val = vdupq_n_f32(1.f - _alpha); + const float32x4_t scale_val2 = vdupq_n_f32(_alpha); + + execute_window_loop(window, [&](const Coordinates & id) + { + acc_we_v16_u8(input.ptr(), accum.ptr(), scale_val, scale_val2); + }, + input, accum); +} + +NEAccumulateSquaredKernel::NEAccumulateSquaredKernel() + : _shift(0) +{ +} + +void NEAccumulateSquaredKernel::configure(const ITensor *input, uint32_t shift, ITensor *accum) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, accum); + + set_shape_if_empty(*accum->info(), input->info()->tensor_shape()); + + set_format_if_unknown(*accum->info(), Format::S16); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, accum); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON(shift > 15); + + _shift = shift; + + constexpr unsigned int num_elems_processed_per_iteration = 
16; + INESimpleKernel::configure(input, accum, num_elems_processed_per_iteration); +} + +void NEAccumulateSquaredKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + Iterator input(_input, window); + Iterator accum(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + acc_sq_v16_u8(input.ptr(), _shift, accum.ptr()); + }, + input, accum); +} diff --git a/src/core/NEON/kernels/NEActivationLayerKernel.cpp b/src/core/NEON/kernels/NEActivationLayerKernel.cpp new file mode 100644 index 0000000000..a878078007 --- /dev/null +++ b/src/core/NEON/kernels/NEActivationLayerKernel.cpp @@ -0,0 +1,302 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" + +#include "arm_compute/core/FixedPoint.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +NEActivationLayerKernel::NEActivationLayerKernel() + : _func(nullptr), _act_info(ActivationFunction::LOGISTIC) +{ +} + +void NEActivationLayerKernel::configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + // Output auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), input->info()->tensor_shape(), 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + // Activation functions : FP32 + static std::map act_map_f32 = + { + { ActivationFunction::ABS, &NEActivationLayerKernel::activation }, + { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation }, + { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, + { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::SQRT, &NEActivationLayerKernel::activation }, + { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, + { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, + }; + + // Activation functions : QS8 + static std::map act_map_qs8 = + { + { ActivationFunction::ABS, &NEActivationLayerKernel::activation }, + { ActivationFunction::LINEAR, &NEActivationLayerKernel::activation }, + { ActivationFunction::LOGISTIC, &NEActivationLayerKernel::activation }, + { ActivationFunction::RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::BOUNDED_RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::SOFT_RELU, &NEActivationLayerKernel::activation }, + { ActivationFunction::SQRT, &NEActivationLayerKernel::activation }, + { ActivationFunction::SQUARE, &NEActivationLayerKernel::activation }, + { ActivationFunction::TANH, &NEActivationLayerKernel::activation }, + }; + + _input = input; + _output = output; + _act_info = activation_info; + switch(input->info()->data_type()) + { + case DataType::F32: + _func = act_map_f32[activation_info.activation()]; + break; + case DataType::QS8: + _func = act_map_qs8[activation_info.activation()]; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + INESimpleKernel::configure(_input, _output, num_elems_processed_per_iteration); +} + +template +typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) +{ + Iterator input(_input, window); + Iterator output(_output, window); + + static const float32x4_t CONST_1 = vdupq_n_f32(1.f); + static const float32x4_t CONST_0 = vdupq_n_f32(0.f); + const float32x4_t a = 
vdupq_n_f32(_act_info.a()); + const float32x4_t b = vdupq_n_f32(_act_info.b()); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const float32x4x4_t in = vld4q_f32(input_ptr); + float32x4x4_t tmp = { {} }; + + switch(F) + { + case ActivationFunction::ABS: + tmp = + { + { + vabsq_f32(in.val[0]), + vabsq_f32(in.val[1]), + vabsq_f32(in.val[2]), + vabsq_f32(in.val[3]), + } + }; + break; + case ActivationFunction::BOUNDED_RELU: + tmp = + { + { + vminq_f32(a, vmaxq_f32(CONST_0, in.val[0])), + vminq_f32(a, vmaxq_f32(CONST_0, in.val[1])), + vminq_f32(a, vmaxq_f32(CONST_0, in.val[2])), + vminq_f32(a, vmaxq_f32(CONST_0, in.val[3])), + } + }; + break; + case ActivationFunction::LINEAR: + tmp = + { + { + vmlaq_f32(b, a, in.val[0]), + vmlaq_f32(b, a, in.val[1]), + vmlaq_f32(b, a, in.val[2]), + vmlaq_f32(b, a, in.val[3]), + } + }; + break; + case ActivationFunction::LOGISTIC: + tmp = + { + { + vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[0])))), + vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[1])))), + vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[2])))), + vinvq_f32(vaddq_f32(CONST_1, vexpq_f32(vnegq_f32(in.val[3])))), + } + }; + break; + case ActivationFunction::RELU: + tmp = + { + { + vmaxq_f32(CONST_0, in.val[0]), + vmaxq_f32(CONST_0, in.val[1]), + vmaxq_f32(CONST_0, in.val[2]), + vmaxq_f32(CONST_0, in.val[3]), + } + }; + break; + case ActivationFunction::SOFT_RELU: + tmp = + { + { + vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[0]))), + vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[1]))), + vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[2]))), + vlogq_f32(vaddq_f32(CONST_1, vexpq_f32(in.val[3]))), + } + }; + break; + case ActivationFunction::SQRT: + tmp = + { + { + vinvq_f32(vinvsqrtq_f32(in.val[0])), + vinvq_f32(vinvsqrtq_f32(in.val[1])), + vinvq_f32(vinvsqrtq_f32(in.val[2])), + vinvq_f32(vinvsqrtq_f32(in.val[3])), + } + }; + break; + case ActivationFunction::SQUARE: + tmp = + { + { + vmulq_f32(in.val[0], in.val[0]), + vmulq_f32(in.val[1], in.val[1]), + vmulq_f32(in.val[2], in.val[2]), + vmulq_f32(in.val[3], in.val[3]), + } + }; + break; + case ActivationFunction::TANH: + tmp = + { + { + vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[0]))), + vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[1]))), + vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[2]))), + vmulq_f32(a, vtanhq_f32(vmulq_f32(b, in.val[3]))), + } + }; + break; + default: + break; + } + + vst4q_f32(output_ptr, tmp); + }, + input, output); +} + +template +typename std::enable_if::value, void>::type NEActivationLayerKernel::activation(const Window &window) +{ + Iterator input(_input, window); + Iterator output(_output, window); + int fixed_point_position = _input->info()->fixed_point_position(); + + static const qint8x16_t CONST_0 = vdupq_n_qs8(0); + const qint8x16_t CONST_1 = vdupq_n_qs8(scvt_qs8_f32(1.f, fixed_point_position)); + const qint8x16_t a = vdupq_n_qs8(scvt_qs8_f32(_act_info.a(), fixed_point_position)); + const qint8x16_t b = vdupq_n_qs8(scvt_qs8_f32(_act_info.b(), fixed_point_position)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto input_ptr = reinterpret_cast(input.ptr()); + const auto output_ptr = reinterpret_cast(output.ptr()); + + const qint8x16_t in = vld1q_qs8(input_ptr); + qint8x16_t tmp = {}; + + switch(F) + { + case ActivationFunction::ABS: + tmp = vqabsq_qs8(in); + break; + case ActivationFunction::BOUNDED_RELU: + tmp = vminq_qs8(a, 
vmaxq_qs8(CONST_0, in)); + break; + case ActivationFunction::LINEAR: + tmp = vqmlaq_qs8(b, a, in, fixed_point_position); + break; + case ActivationFunction::LOGISTIC: + tmp = vrecipq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(vnegq_s8(in), fixed_point_position)), fixed_point_position); + break; + case ActivationFunction::RELU: + tmp = vmaxq_qs8(CONST_0, in); + break; + case ActivationFunction::SOFT_RELU: + tmp = vlogq_qs8(vqaddq_qs8(CONST_1, vqexpq_qs8(in, fixed_point_position)), fixed_point_position); + break; + case ActivationFunction::SQRT: + tmp = vrecipq_qs8(vinvsqrtq_qs8(in, fixed_point_position), fixed_point_position); + break; + case ActivationFunction::SQUARE: + tmp = vqmulq_qs8(in, in, fixed_point_position); + break; + case ActivationFunction::TANH: + tmp = vtanhq_qs8(in, fixed_point_position); + break; + default: + break; + } + + vst1q_qs8(output_ptr, tmp); + }, + input, output); +} + +void NEActivationLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp new file mode 100644 index 0000000000..a4fdad8a2a --- /dev/null +++ b/src/core/NEON/kernels/NEArithmeticAdditionKernel.cpp @@ -0,0 +1,378 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +void add_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + vst1q_u8(output.ptr(), vaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr()))); + }, + input1, input2, output); +} + +void add_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + vst1q_u8(output.ptr(), vqaddq_u8(vld1q_u8(input1.ptr()), vld1q_u8(input2.ptr()))); + }, + input1, input2, output); +} + +inline int16x8x2_t vadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b) +{ + const int16x8x2_t res = + { + { + vaddq_s16(a.val[0], b.val[0]), + vaddq_s16(a.val[1], b.val[1]) + } + }; + + return res; +} + +inline float32x4x4_t vadd4q_f32(const float32x4x4_t &a, const float32x4x4_t &b) +{ + const float32x4x4_t res = + { + { + vaddq_f32(a.val[0], b.val[0]), + vaddq_f32(a.val[1], b.val[1]), + vaddq_f32(a.val[2], b.val[2]), + vaddq_f32(a.val[3], b.val[3]) + } + }; + + return res; +} + +inline int16x8x2_t vqadd2q_s16(const int16x8x2_t &a, const int16x8x2_t &b) +{ + const int16x8x2_t res = + { + { + vqaddq_s16(a.val[0], b.val[0]), + vqaddq_s16(a.val[1], b.val[1]) + } + }; + + return res; +} + +void add_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x4_t a = vld4q_f32(reinterpret_cast(input1.ptr())); + const float32x4x4_t b = vld4q_f32(reinterpret_cast(input2.ptr())); + + vst4q_f32(reinterpret_cast(output.ptr()), vadd4q_f32(a, b)); + }, + input1, input2, output); +} + +void add_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t a = vld2q_s16(reinterpret_cast(input1.ptr())); + const int16x8x2_t b = vld2q_s16(reinterpret_cast(input2.ptr())); + + vst2q_s16(reinterpret_cast(output.ptr()), vadd2q_s16(a, b)); + }, + input1, input2, output); +} + +void add_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t a = vld2q_s16(reinterpret_cast(input1.ptr())); + const int16x8x2_t b = vld2q_s16(reinterpret_cast(input2.ptr())); + + vst2q_s16(reinterpret_cast(output.ptr()), vqadd2q_s16(a, b)); + }, + input1, input2, output); +} + +void add_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor 
*out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t a = + { + { + vld1q_s16(reinterpret_cast(input1.ptr())), + vld1q_s16(reinterpret_cast(input1.ptr()) + 8) + } + }; + const uint8x16_t b = vld1q_u8(input2.ptr()); + + vst1q_s16(reinterpret_cast(output.ptr()), vaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))))); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, vaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))))); + }, + input1, input2, output); +} + +void add_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t a = + { + { + vld1q_s16(reinterpret_cast(input1.ptr())), + vld1q_s16(reinterpret_cast(input1.ptr()) + 8) + } + }; + const uint8x16_t b = vld1q_u8(input2.ptr()); + + vst1q_s16(reinterpret_cast(output.ptr()), vqaddq_s16(a.val[0], vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))))); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqaddq_s16(a.val[1], vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))))); + }, + input1, input2, output); +} + +inline void add_wrap_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window) +{ + //Simply swap the two input buffers: + add_wrap_S16_U8_S16(input2, input1, output, window); +} + +inline void add_saturate_U8_S16_S16(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window) +{ + //Simply swap the two input buffers: + add_saturate_S16_U8_S16(input2, input1, output, window); +} + +void add_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t a = vld1q_u8(input1.ptr()); + const uint8x16_t b = vld1q_u8(input2.ptr()); + + const int16x8x2_t a_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))) + } + }; + + const int16x8x2_t b_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))) + } + }; + + vst1q_s16(reinterpret_cast(output.ptr()), vaddq_s16(a_s16.val[0], b_s16.val[0])); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, vaddq_s16(a_s16.val[1], b_s16.val[1])); + }, + input1, input2, output); +} + +void add_saturate_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t a = vld1q_u8(input1.ptr()); + const uint8x16_t b = vld1q_u8(input2.ptr()); + + const int16x8x2_t a_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(a))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(a))) + } + }; + + const int16x8x2_t b_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b))) + } + }; + + vst1q_s16(reinterpret_cast(output.ptr()), vqaddq_s16(a_s16.val[0], b_s16.val[0])); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, vqaddq_s16(a_s16.val[1], b_s16.val[1])); + }, + input1, input2, output); 
+} +} // namespace + +NEArithmeticAdditionKernel::NEArithmeticAdditionKernel() + : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void NEArithmeticAdditionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + + set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); + + if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16) + { + set_format_if_unknown(*output->info(), Format::S16); + } + else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + + static std::map map_function = + { + { "add_wrap_U8_U8_U8", &add_wrap_U8_U8_U8 }, + { "add_saturate_U8_U8_U8", &add_saturate_U8_U8_U8 }, + { "add_wrap_S16_U8_S16", &add_wrap_S16_U8_S16 }, + { "add_saturate_S16_U8_S16", &add_saturate_S16_U8_S16 }, + { "add_wrap_U8_S16_S16", &add_wrap_U8_S16_S16 }, + { "add_saturate_U8_S16_S16", &add_saturate_U8_S16_S16 }, + { "add_wrap_U8_U8_S16", &add_wrap_U8_U8_S16 }, + { "add_saturate_U8_U8_S16", &add_saturate_U8_U8_S16 }, + { "add_wrap_S16_S16_S16", &add_wrap_S16_S16_S16 }, + { "add_saturate_S16_S16_S16", &add_saturate_S16_S16_S16 }, + { "add_wrap_F32_F32_F32", &add_F32_F32_F32 }, + { "add_saturate_F32_F32_F32", &add_F32_F32_F32 }, + }; + + _input1 = input1; + _input2 = input2; + _output = output; + + std::string function_to_call("add_"); + function_to_call += policy == ConvertPolicy::WRAP ? 
"wrap_" : "saturate_"; + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if(it != map_function.end()) + { + _func = it->second; + } + else + { + ARM_COMPUTE_ERROR("You called arithmetic addition with the wrong tensor data type"); + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEArithmeticAdditionKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input1, _input2, _output, window); +} diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp new file mode 100644 index 0000000000..d3e62b069e --- /dev/null +++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp @@ -0,0 +1,371 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +void sub_wrap_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t ta1 = vld1q_u8(input1.ptr()); + const uint8x16_t ta2 = vld1q_u8(input2.ptr()); + + vst1q_u8(output.ptr(), vsubq_u8(ta1, ta2)); + }, + input1, input2, output); +} + +void sub_saturate_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t ta1 = vld1q_u8(input1.ptr()); + const uint8x16_t ta2 = vld1q_u8(input2.ptr()); + + vst1q_u8(output.ptr(), vqsubq_u8(ta1, ta2)); + }, + input1, input2, output); +} + +void sub_wrap_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast(input1.ptr())); + const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast(input2.ptr())); + + const int16x8x2_t ta3 = + { + { + vsubq_s16(ta1.val[0], ta2.val[0]), + vsubq_s16(ta1.val[1], ta2.val[1]) + } + }; + + vst2q_s16(reinterpret_cast(output.ptr()), ta3); + }, + input1, input2, output); +} + +void sub_saturate_S16_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t ta1 = vld2q_s16(reinterpret_cast(input1.ptr())); + const int16x8x2_t ta2 = vld2q_s16(reinterpret_cast(input2.ptr())); + + const int16x8x2_t ta3 = + { + { + vqsubq_s16(ta1.val[0], ta2.val[0]), + vqsubq_s16(ta1.val[1], ta2.val[1]) + } + }; + + vst2q_s16(reinterpret_cast(output.ptr()), ta3); + }, + input1, input2, output); +} + +void sub_F32_F32_F32(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x4_t ta1 = vld4q_f32(reinterpret_cast(input1.ptr())); + const float32x4x4_t ta2 = vld4q_f32(reinterpret_cast(input2.ptr())); + + const float32x4x4_t ta3 = + { + { + vsubq_f32(ta1.val[0], ta2.val[0]), + vsubq_f32(ta1.val[1], ta2.val[1]), + vsubq_f32(ta1.val[2], ta2.val[2]), + vsubq_f32(ta1.val[3], ta2.val[3]), + } + }; + + vst4q_f32(reinterpret_cast(output.ptr()), ta3); + }, + input1, input2, output); +} +void sub_wrap_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t bv_0 = 
vld1q_u8(input2.ptr()); + int16x8_t a1_0 = vld1q_s16(reinterpret_cast(input1.ptr())); + int16x8_t a2_0 = vld1q_s16(reinterpret_cast(input1.ptr()) + 8); + + a1_0 = vsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0)))); + a2_0 = vsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0)))); + + vst1q_s16(reinterpret_cast(output.ptr()), a1_0); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, a2_0); + }, + input1, input2, output); +} + +void sub_saturate_S16_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t bv_0 = vld1q_u8(input2.ptr()); + int16x8_t a1_0 = vld1q_s16(reinterpret_cast(input1.ptr())); + int16x8_t a2_0 = vld1q_s16(reinterpret_cast(input1.ptr()) + 8); + + a1_0 = vqsubq_s16(a1_0, vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0)))); + a2_0 = vqsubq_s16(a2_0, vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0)))); + + vst1q_s16(reinterpret_cast(output.ptr()), a1_0); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, a2_0); + }, + input1, input2, output); +} + +void sub_wrap_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t bv_0 = vld1q_u8(input1.ptr()); + int16x8_t a1_0 = vld1q_s16(reinterpret_cast(input2.ptr())); + int16x8_t a2_0 = vld1q_s16(reinterpret_cast(input2.ptr()) + 8); + + a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0); + a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0); + + vst1q_s16(reinterpret_cast(output.ptr()), a1_0); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, a2_0); + }, + input1, input2, output); +} + +void sub_saturate_U8_S16_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t bv_0 = vld1q_u8(input1.ptr()); + int16x8_t a1_0 = vld1q_s16(reinterpret_cast(input2.ptr())); + int16x8_t a2_0 = vld1q_s16(reinterpret_cast(input2.ptr()) + 8); + + a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0))), a1_0); + a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0))), a2_0); + + vst1q_s16(reinterpret_cast(output.ptr()), a1_0); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, a2_0); + }, + input1, input2, output); +} + +void sub_wrap_U8_U8_S16(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t av_0 = vld1q_u8(input1.ptr()); + const uint8x16_t bv_0 = vld1q_u8(input2.ptr()); + + const int16x8_t a1_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))), + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0)))); + const int16x8_t a2_0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0)))); + + vst1q_s16(reinterpret_cast(output.ptr()), a1_0); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, a2_0); + }, + input1, input2, output); +} + +void sub_saturate_U8_U8_S16(const ITensor *in1, 
const ITensor *in2, ITensor *out, const Window &window) +{ + Iterator input1(in1, window); + Iterator input2(in2, window); + Iterator output(out, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t av_0 = vld1q_u8(input1.ptr()); + const uint8x16_t bv_0 = vld1q_u8(input2.ptr()); + + const int16x8_t a1_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(av_0))), + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bv_0)))); + const int16x8_t a2_0 = vqsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(av_0))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bv_0)))); + + vst1q_s16(reinterpret_cast(output.ptr()), a1_0); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, a2_0); + }, + input1, input2, output); +} +} // namespace + +NEArithmeticSubtractionKernel::NEArithmeticSubtractionKernel() + : _func(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void NEArithmeticSubtractionKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + + set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); + + if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16) + { + set_format_if_unknown(*output->info(), Format::S16); + } + else if(input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) + { + set_format_if_unknown(*output->info(), Format::F32); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + + static std::map map_function = + { + { "sub_wrap_U8_U8_U8", &sub_wrap_U8_U8_U8 }, + { "sub_wrap_U8_U8_S16", &sub_wrap_U8_U8_S16 }, + { "sub_saturate_U8_U8_U8", &sub_saturate_U8_U8_U8 }, + { "sub_saturate_U8_U8_S16", &sub_saturate_U8_U8_S16 }, + { "sub_wrap_U8_S16_S16", &sub_wrap_U8_S16_S16 }, + { "sub_wrap_S16_U8_S16", &sub_wrap_S16_U8_S16 }, + { "sub_saturate_U8_S16_S16", &sub_saturate_U8_S16_S16 }, + { "sub_saturate_S16_U8_S16", &sub_saturate_S16_U8_S16 }, + { "sub_wrap_S16_S16_S16", &sub_wrap_S16_S16_S16 }, + { "sub_saturate_S16_S16_S16", &sub_saturate_S16_S16_S16 }, + { "sub_wrap_F32_F32_F32", &sub_F32_F32_F32 }, + { "sub_saturate_F32_F32_F32", &sub_F32_F32_F32 }, + }; + + _input1 = input1; + _input2 = input2; + _output = output; + + std::string function_to_call("sub_"); + function_to_call += policy == ConvertPolicy::WRAP ? 
"wrap_" : "saturate_"; + function_to_call += string_from_data_type(input1->info()->data_type()) + "_"; + function_to_call += string_from_data_type(input2->info()->data_type()) + "_"; + function_to_call += string_from_data_type(output->info()->data_type()); + + auto it = map_function.find(function_to_call); + + if(it != map_function.end()) + { + _func = it->second; + } + else + { + ARM_COMPUTE_ERROR("You called subtract with the wrong image formats"); + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEArithmeticSubtractionKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input1, _input2, _output, window); +} diff --git a/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp new file mode 100644 index 0000000000..9a216aecde --- /dev/null +++ b/src/core/NEON/kernels/NEBatchNormalizationLayerKernel.cpp @@ -0,0 +1,187 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEBatchNormalizationLayerKernel::NEBatchNormalizationLayerKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _mean(nullptr), _var(nullptr), _gamma(nullptr), _beta(nullptr), _epsilon() +{ +} + +void batch_normalization_q8(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window) +{ + Iterator input(in, window); + Iterator output(out, window); + + // Hold information about the current feature map we are iterating. + // Only compute denominator and NEON vectors once per feature map. + int slice = -1; + + int fixed_point_position = in->info()->fixed_point_position(); + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))); + const auto input_beta = reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))); + + qint8x16_t mean_vec = vdupq_n_qs8(0); + qint8x16_t var_vec = vdupq_n_qs8(0); + qint8x16_t gamma_vec = vdupq_n_qs8(0); + qint8x16_t beta_vec = vdupq_n_qs8(0); + qint8x16_t denominator = vdupq_n_qs8(0); + const qint8x16_t epsilon_vec = vdupq_n_qs8(scvt_qs8_f32(epsilon, fixed_point_position)); + execute_window_loop(window, [&](const Coordinates & id) + { + if(slice != id.z()) + { + // Conctruct vectors + mean_vec = vdupq_n_qs8(*(input_mean + id.z())); + var_vec = vdupq_n_qs8(*(input_var + id.z())); + gamma_vec = vdupq_n_qs8(*(input_gamma + id.z())); + beta_vec = vdupq_n_qs8(*(input_beta + id.z())); + + // Calculate denominator + denominator = vqinvsqrtq_qs8(vqaddq_qs8(var_vec, epsilon_vec), fixed_point_position); + slice = id.z(); + } + + // Calculate x bar and store results + const qint8x16_t numerator = vqsubq_qs8(vld1q_qs8(reinterpret_cast(input.ptr())), mean_vec); + const qint8x16_t x_bar = vqmulq_qs8(numerator, denominator, fixed_point_position); + vst1q_qs8(reinterpret_cast(output.ptr()), vqmlaq_qs8(beta_vec, x_bar, gamma_vec, fixed_point_position)); + }, + input, output); +} + +void batch_normalization_fp32(const ITensor *in, ITensor *out, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window) +{ + Iterator input(in, window); + Iterator output(out, window); + + // Hold information about the current feature map we are iterating. + // Only compute denominator and NEON vectors once per feature map. 
+ int slice = -1; + + const auto input_mean = reinterpret_cast(mean->ptr_to_element(Coordinates(0, 0))); + const auto input_var = reinterpret_cast(var->ptr_to_element(Coordinates(0, 0))); + const auto input_gamma = reinterpret_cast(gamma->ptr_to_element(Coordinates(0, 0))); + const auto input_beta = reinterpret_cast(beta->ptr_to_element(Coordinates(0, 0))); + + float32x4_t mean_vec = vdupq_n_f32(0.0); + float32x4_t var_vec = vdupq_n_f32(0.0); + float32x4_t gamma_vec = vdupq_n_f32(0.0); + float32x4_t beta_vec = vdupq_n_f32(0.0); + float32x4_t denominator = vdupq_n_f32(0.0); + const float32x4_t epsilon_vec = vdupq_n_f32(epsilon); + execute_window_loop(window, [&](const Coordinates & id) + { + if(slice != id.z()) + { + // Conctruct vectors + mean_vec = vdupq_n_f32(*(input_mean + id.z())); + var_vec = vdupq_n_f32(*(input_var + id.z())); + gamma_vec = vdupq_n_f32(*(input_gamma + id.z())); + beta_vec = vdupq_n_f32(*(input_beta + id.z())); + + // Calculate denominator + denominator = vinvsqrtq_f32(vaddq_f32(var_vec, epsilon_vec)); + slice = id.z(); + } + + // Calculate x bar and store results + const float32x4_t numerator = vsubq_f32(vld1q_f32(reinterpret_cast(input.ptr())), mean_vec); + const float32x4_t x_bar = vmulq_f32(numerator, denominator); + vst1q_f32(reinterpret_cast(output.ptr()), vmlaq_f32(beta_vec, x_bar, gamma_vec)); + }, + input, output); +} + +void NEBatchNormalizationLayerKernel::configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mean, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(var, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gamma, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(beta, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, var); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, beta); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(mean, gamma); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + + _input = input; + _output = output; + _mean = mean; + _var = var; + _gamma = gamma; + _beta = beta; + _epsilon = epsilon; + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &batch_normalization_q8; + num_elems_processed_per_iteration = 16; + break; + case DataType::F32: + _func = &batch_normalization_fp32; + num_elems_processed_per_iteration = 4; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEBatchNormalizationLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + 
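[Editor's note] The per-iteration step chosen in configure() is simply how many elements of the given data type fit in one 128-bit NEON register (16 one-byte QS8 values, 4 four-byte F32 values). A small illustrative check, using a hypothetical helper that is not library code:

#include <cstddef>

// One 128-bit (16-byte) register divided by the element size gives the step.
constexpr std::size_t elems_per_iteration(std::size_t element_size_in_bytes)
{
    return 16u / element_size_in_bytes;
}

static_assert(elems_per_iteration(1) == 16, "QS8 path processes 16 elements per iteration");
static_assert(elems_per_iteration(sizeof(float)) == 4, "F32 path processes 4 elements per iteration");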
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _output, _mean, _var, _beta, _gamma, _epsilon, window); +} diff --git a/src/core/NEON/kernels/NEBitwiseAndKernel.cpp b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp new file mode 100644 index 0000000000..e8e448e455 --- /dev/null +++ b/src/core/NEON/kernels/NEBitwiseAndKernel.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +inline void bitwise_and_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +{ + const uint8x16_t val1 = vld1q_u8(input1); + const uint8x16_t val2 = vld1q_u8(input2); + + vst1q_u8(output, vandq_u8(val1, val2)); +} +} // namespace + +NEBitwiseAndKernel::NEBitwiseAndKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void NEBitwiseAndKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + + set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); + + set_format_if_unknown(*output->info(), Format::U8); + set_format_if_unknown(*input1->info(), Format::U8); + set_format_if_unknown(*input2->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + + _input1 = input1; + _input2 = input2; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input1->info(), 0, 
num_elems_processed_per_iteration), + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); + + const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEBitwiseAndKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Iterator input1(_input1, window); + Iterator input2(_input2, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + bitwise_and_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); + }, + input1, input2, output); +} diff --git a/src/core/NEON/kernels/NEBitwiseNotKernel.cpp b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp new file mode 100644 index 0000000000..bf75592677 --- /dev/null +++ b/src/core/NEON/kernels/NEBitwiseNotKernel.cpp @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +inline void bitwise_not_U8_U8(const uint8_t *__restrict input, uint8_t *__restrict output) +{ + const uint8x16_t val0 = vld1q_u8(input); + + vst1q_u8(output, vmvnq_u8(val0)); +} +} // namespace + +NEBitwiseNotKernel::NEBitwiseNotKernel() + : _input(nullptr), _output(nullptr) +{ +} + +void NEBitwiseNotKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + set_format_if_unknown(*output->info(), Format::U8); + set_format_if_unknown(*input->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEBitwiseNotKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Iterator input(_input, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + bitwise_not_U8_U8(input.ptr(), output.ptr()); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NEBitwiseOrKernel.cpp b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp new file mode 100644 index 0000000000..f184be2f26 --- /dev/null +++ b/src/core/NEON/kernels/NEBitwiseOrKernel.cpp @@ -0,0 +1,109 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +inline void bitwise_or_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +{ + const uint8x16_t val1 = vld1q_u8(input1); + const uint8x16_t val2 = vld1q_u8(input2); + + vst1q_u8(output, vorrq_u8(val1, val2)); +} +} // namespace + +NEBitwiseOrKernel::NEBitwiseOrKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void NEBitwiseOrKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + + set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); + + set_format_if_unknown(*output->info(), Format::U8); + set_format_if_unknown(*input1->info(), Format::U8); + set_format_if_unknown(*input2->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + + _input1 = input1; + _input2 = input2; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); + + const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEBitwiseOrKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Iterator input1(_input1, window); + Iterator input2(_input2, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + bitwise_or_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); + }, + input1, input2, output); +} diff --git a/src/core/NEON/kernels/NEBitwiseXorKernel.cpp b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp new file mode 100644 index 0000000000..c4fb4c0d03 --- /dev/null +++ b/src/core/NEON/kernels/NEBitwiseXorKernel.cpp @@ -0,0 +1,105 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +inline void bitwise_xor_U8_U8_U8(const uint8_t *__restrict input1, const uint8_t *__restrict input2, uint8_t *__restrict output) +{ + const uint8x16_t val1 = vld1q_u8(input1); + const uint8x16_t val2 = vld1q_u8(input2); + + vst1q_u8(output, veorq_u8(val1, val2)); +} +} // namespace + +NEBitwiseXorKernel::NEBitwiseXorKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr) +{ +} + +void NEBitwiseXorKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + + set_shape_if_empty(*output->info(), input1->info()->tensor_shape()); + + set_format_if_unknown(*output->info(), Format::U8); + set_format_if_unknown(*input1->info(), Format::U8); + set_format_if_unknown(*input2->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + + _input1 = input1; + _input2 = input2; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), output_access); + + const ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEBitwiseXorKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + 
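[Editor's note] The run() loop below applies the 16-byte XOR helper once per window step. A standalone sketch of the same idea over a flat buffer, as an assumed helper (length taken as a multiple of 16; in the real kernel the window and padding take care of the tail):

#include <arm_neon.h>
#include <cstddef>
#include <cstdint>

// XOR two byte buffers 16 bytes at a time using a single NEON op per block.
void xor_blocks(const uint8_t *a, const uint8_t *b, uint8_t *dst, std::size_t n /* multiple of 16 */)
{
    for(std::size_t i = 0; i < n; i += 16)
    {
        vst1q_u8(dst + i, veorq_u8(vld1q_u8(a + i), vld1q_u8(b + i)));
    }
}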
ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Iterator input1(_input1, window); + Iterator input2(_input2, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + bitwise_xor_U8_U8_U8(input1.ptr(), input2.ptr(), output.ptr()); + }, + input1, input2, output); +} diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp new file mode 100644 index 0000000000..d7e6d73cd7 --- /dev/null +++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp @@ -0,0 +1,220 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Validate.h" +#include + +using namespace arm_compute; + +#ifdef ARM_COMPUTE_ENABLE_FP16 +void NEBox3x3FP16Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator output(_output, window); + + unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1)); + unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0)); + unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1)); + + const float16x8_t oneovernine = vdupq_n_f16(1.0f / 9.0f); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const float16x8x2_t top_f16 = + { + { + vcvtq_f16_u16(vmovl_u8(vget_low_u8(top_data))), + vcvtq_f16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + + const float16x8x2_t mid_f16 = + { + { + vcvtq_f16_u16(vmovl_u8(vget_low_u8(mid_data))), + vcvtq_f16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + + const float16x8x2_t bot_f16 = + { + { + vcvtq_f16_u16(vmovl_u8(vget_low_u8(bot_data))), + vcvtq_f16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + float16x8_t out = top_f16.val[0]; + //top mid + out = vaddq_f16(out, vextq_f16(top_f16.val[0], 
top_f16.val[1], 1)); + //top right + out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 2)); + //mid left + out = vaddq_f16(out, mid_f16.val[0]); + //mid mid + out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 1)); + //mid right + out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 2)); + //bot left + out = vaddq_f16(out, bot_f16.val[0]); + //bot mid + out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 1)); + //bot right + out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 2)); + + out = vmulq_f16(out, oneovernine); + + vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(out))); + }, + input, output); +} +#endif + +BorderSize NEBox3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void NEBox3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + set_format_if_unknown(*input->info(), Format::U8); + set_format_if_unknown(*output->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + constexpr int rect_offset_xy = -1; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration), output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEBox3x3Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator output(_output, window); + + unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1)); + unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0)); + unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1)); + + const float32x4_t oneovernine = vdupq_n_f32(1.0f / 9.0f); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + 
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + int16x8_t out = top_s16.val[0]; + //top mid + out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1)); + //top right + out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out = vaddq_s16(out, mid_s16.val[0]); + //mid mid + out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1)); + //mid right + out = vaddq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2)); + //bot left + out = vaddq_s16(out, bot_s16.val[0]); + //bot mid + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1)); + //bot right + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + float32x4_t outfloathigh = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out))); + float32x4_t outfloatlow = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out))); + + outfloathigh = vmulq_f32(outfloathigh, oneovernine); + outfloatlow = vmulq_f32(outfloatlow, oneovernine); + + out = vcombine_s16(vqmovn_s32(vcvtq_s32_f32(outfloatlow)), + vqmovn_s32(vcvtq_s32_f32(outfloathigh))); + + vst1_u8(output.ptr(), vqmovun_s16(out)); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NECannyEdgeKernel.cpp b/src/core/NEON/kernels/NECannyEdgeKernel.cpp new file mode 100644 index 0000000000..85a2cd5855 --- /dev/null +++ b/src/core/NEON/kernels/NECannyEdgeKernel.cpp @@ -0,0 +1,1856 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +constexpr int NO_EDGE = 0; +constexpr int EDGE = 255; +constexpr int MAYBE = 127; +} // namespace + +#ifdef ARM_COMPUTE_ENABLE_FP16 +namespace fp16 +{ +inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy) +{ + // Constant use for evaluating score1 and score3 + static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f); + static const float32x4_t zero = vdupq_n_f32(0.0f); + static const float32x4_t one = vdupq_n_f32(1.0f); + static const float32x4_t two = vdupq_n_f32(2.0f); + static const float32x4_t three = vdupq_n_f32(3.0f); + + // Score0: (1, 0) + const float32x4x2_t score0 = + { + vabsq_f32(gx.val[0]), + vabsq_f32(gx.val[1]) + }; + + // Score2: ( 0, 1 ) + const float32x4x2_t score2 = + { + vabsq_f32(gy.val[0]), + vabsq_f32(gy.val[1]) + }; + + // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 ) + float32x4x2_t score1 = + { + vmulq_f32(gy.val[0], const45), + vmulq_f32(gy.val[1], const45) + }; + + float32x4x2_t score3 = score1; + + score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45); + score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45); + score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45); + score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45); + + score1.val[0] = vabsq_f32(score1.val[0]); + score1.val[1] = vabsq_f32(score1.val[1]); + score3.val[0] = vabsq_f32(score3.val[0]); + score3.val[1] = vabsq_f32(score3.val[1]); + + float32x4x2_t phase = + { + zero, + zero + }; + + float32x4x2_t old_score = score0; + + // score1 > old_score? + uint32x4x2_t mask = + { + vcgtq_f32(score1.val[0], old_score.val[0]), + vcgtq_f32(score1.val[1], old_score.val[1]) + }; + + phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]); + phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]); + old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]); + old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]); + + // score2 > old_score? + mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]); + mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]); + + phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]); + phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]); + old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]); + old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]); + + // score3 > old_score? 
+ mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]); + mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]); + + phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]); + phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]); + old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]); + old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]); + + // Convert from float32x4_t to uint8x8_t + return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])), + vmovn_u32(vcvtq_u32_f32(phase.val[1])))); +} + +inline uint8x8_t phase_quantization(float16x8_t gx, float16x8_t gy) +{ + // Constant use for evaluating score1 and score3 + static const float16x8_t const45 = vdupq_n_f16(0.70710678118655f); + static const float16x8_t zero = vdupq_n_f16(0.0f); + static const float16x8_t one = vdupq_n_f16(1.0f); + static const float16x8_t two = vdupq_n_f16(2.0f); + static const float16x8_t three = vdupq_n_f16(3.0f); + + // Score0: (1, 0) + const float16x8_t score0 = vabsq_f16(gx); + + // Score2: ( 0, 1 ) + const float16x8_t score2 = vabsq_f16(gy); + + // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 ) + float16x8_t score1 = vmulq_f16(gy, const45); + float16x8_t score3 = score1; + + score1 = vfmaq_f16(score1, gx, const45); + score3 = vfmsq_f16(score3, gx, const45); + + score1 = vabsq_f16(score1); + score3 = vabsq_f16(score3); + + float16x8_t phase = zero; + float16x8_t old_score = score0; + + // score1 > old_score? + uint16x8_t mask = vcgtq_f16(score1, old_score); + + phase = vbslq_f16(mask, one, phase); + old_score = vbslq_f16(mask, score1, old_score); + + // score2 > old_score? + mask = vcgtq_f16(score2, old_score); + + phase = vbslq_f16(mask, two, phase); + old_score = vbslq_f16(mask, score2, old_score); + + // score3 > old_score? + mask = vcgtq_f16(score3, old_score); + + phase = vbslq_f16(mask, three, phase); + + // Convert from float16x8_t to uint8x8_t + return vmovn_u16(vcvtq_u16_f16(phase)); +} + +/** Computes the gradient phase if gradient_size = 3 or 5. The output is quantized. + * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return quantized phase for 8 pixels + */ +inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy) +{ + return phase_quantization(vcvtq_f16_s16(gx), vcvtq_f16_s16(gy)); +} + +/** Computes the gradient phase if gradient_size = 7. The output is quantized. 
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return quantized phase for 8 pixels + */ +inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy) +{ + // Convert to float + const float32x4x2_t gx_f32 = + { + vcvtq_f32_s32(gx.val[0]), + vcvtq_f32_s32(gx.val[1]) + }; + + const float32x4x2_t gy_f32 = + { + vcvtq_f32_s32(gy.val[0]), + vcvtq_f32_s32(gy.val[1]) + }; + + return phase_quantization(gx_f32, gy_f32); +} + +/** Computes the magnitude using the L1-norm type if gradient_size = 3 or 5 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy) +{ + return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)), + vreinterpretq_u16_s16(vabsq_s16(gy))); +} + +/** Computes the magnitude using the L1-norm type if gradient_size = 7 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy) +{ + const uint32x4x2_t gx_abs = + { + vreinterpretq_u32_s32(vabsq_s32(gx.val[0])), + vreinterpretq_u32_s32(vabsq_s32(gx.val[1])) + }; + + const uint32x4x2_t gy_abs = + { + vreinterpretq_u32_s32(vabsq_s32(gy.val[0])), + vreinterpretq_u32_s32(vabsq_s32(gy.val[1])) + }; + + const uint32x4x2_t out = + { + vaddq_u32(gx_abs.val[0], gy_abs.val[0]), + vaddq_u32(gx_abs.val[1], gy_abs.val[1]) + }; + + return out; +} + +inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy) +{ + // x^2 ... + float32x4x2_t mag = + { + vmulq_f32(gx.val[0], gx.val[0]), + vmulq_f32(gx.val[1], gx.val[1]) + }; + + // ... + y^2 + mag.val[0] = vmlaq_f32(mag.val[0], gy.val[0], gy.val[0]); + mag.val[1] = vmlaq_f32(mag.val[1], gy.val[1], gy.val[1]); + + // sqrt(...) + mag.val[0] = vmulq_f32(vrsqrteq_f32(mag.val[0]), mag.val[0]); + mag.val[1] = vmulq_f32(vrsqrteq_f32(mag.val[1]), mag.val[1]); + + return mag; +} + +inline float16x8_t mag_l2(float16x8_t gx, float16x8_t gy) +{ + // x^2 ... + float16x8_t mag = vmulq_f16(gx, gx); + + // ... + y^2 + mag = vfmaq_f16(mag, gy, gy); + + // sqrt(...) 
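[Editor's note] The statement that follows approximates sqrt(x) as x * rsqrt_estimate(x) via vrsqrteq, avoiding a full square root in the vector path. The scalar quantity being approximated is the plain L2 magnitude; an illustrative reference using the exact square root:

#include <cmath>

// Exact scalar L2 magnitude the vector code approximates with x * rsqrt(x).
inline float magnitude_l2_scalar(float gx, float gy)
{
    return std::sqrt(gx * gx + gy * gy);
}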
+ mag = vmulq_f16(vrsqrteq_f16(mag), mag); + + return mag; +} + +/** Computes the magnitude using L2-norm if gradient_size = 3 or 5 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy) +{ + /* Compute magnitude using L2 normalization */ + const float16x8_t gx2 = vcvtq_f16_s16(gx); + const float16x8_t gy2 = vcvtq_f16_s16(gy); + const float16x8_t mag = mag_l2(gx2, gy2); + + /* Store magnitude - Convert to uint16x8 */ + return vcvtq_u16_f16(mag); +} + +/** Computes the magnitude using L2-norm if gradient_size = 7 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy) +{ + // Compute magnitude using L2 normalization + float32x4x2_t gx2 = + { + vcvtq_f32_s32(gx.val[0]), + vcvtq_f32_s32(gx.val[1]) + }; + + float32x4x2_t gy2 = + { + vcvtq_f32_s32(gy.val[0]), + vcvtq_f32_s32(gy.val[1]) + }; + + const float32x4x2_t mag = mag_l2(gx2, gy2); + const uint32x4x2_t mag32 = + { + vcvtq_u32_f32(mag.val[0]), + vcvtq_u32_f32(mag.val[1]) + }; + + return mag32; +} + +/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm + * + * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16 + * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16 + * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16 + * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8 + */ +void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr) +{ + const auto in1 = static_cast(in1_ptr); + const auto in2 = static_cast(in2_ptr); + const auto out1 = static_cast(out1_ptr); + const auto out2 = static_cast(out2_ptr); + + const int16x8x4_t gx = + { + vld1q_s16(in1), + vld1q_s16(in1 + 8), + vld1q_s16(in1 + 16), + vld1q_s16(in1 + 24) + }; + + const int16x8x4_t gy = + { + vld1q_s16(in2), + vld1q_s16(in2 + 8), + vld1q_s16(in2 + 16), + vld1q_s16(in2 + 24) + }; + + // Compute and store phase + vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0])); + vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1])); + vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2])); + vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3])); + + // Compute ans store magnitude using L1 normalization + vst1q_u16(out1 + 0, mag_l1_S16_S16(gx.val[0], gy.val[0])); + vst1q_u16(out1 + 8, mag_l1_S16_S16(gx.val[1], gy.val[1])); + vst1q_u16(out1 + 16, mag_l1_S16_S16(gx.val[2], gy.val[2])); + vst1q_u16(out1 + 24, mag_l1_S16_S16(gx.val[3], gy.val[3])); +} + +/** Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm + * + * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S16 + * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S16 + * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U16 + * @param[out] out2_ptr Pointer to destination image. Quantized phase. 
Data type supported U8 + */ +void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr) +{ + const auto in1 = static_cast(in1_ptr); + const auto in2 = static_cast(in2_ptr); + const auto out1 = static_cast(out1_ptr); + const auto out2 = static_cast(out2_ptr); + + const int16x8x4_t gx = + { + vld1q_s16(in1), + vld1q_s16(in1 + 8), + vld1q_s16(in1 + 16), + vld1q_s16(in1 + 24) + }; + + const int16x8x4_t gy = + { + vld1q_s16(in2), + vld1q_s16(in2 + 8), + vld1q_s16(in2 + 16), + vld1q_s16(in2 + 24) + }; + + // Compute and store phase + vst1_u8(out2 + 0, phase_quantization_S16_S16(gx.val[0], gy.val[0])); + vst1_u8(out2 + 8, phase_quantization_S16_S16(gx.val[1], gy.val[1])); + vst1_u8(out2 + 16, phase_quantization_S16_S16(gx.val[2], gy.val[2])); + vst1_u8(out2 + 24, phase_quantization_S16_S16(gx.val[3], gy.val[3])); + + // Compute and store magnitude using L2 normalization + vst1q_u16(out1 + 0, mag_l2_S16_S16(gx.val[0], gy.val[0])); + vst1q_u16(out1 + 8, mag_l2_S16_S16(gx.val[1], gy.val[1])); + vst1q_u16(out1 + 16, mag_l2_S16_S16(gx.val[2], gy.val[2])); + vst1q_u16(out1 + 24, mag_l2_S16_S16(gx.val[3], gy.val[3])); +} + +/** Gradient function used when the gradient size = 7 and when the norm_type = L1-norm + * + * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32 + * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32 + * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32 + * @param[out] out2_ptr Pointer to destination image. Quantized phase. Data type supported U8 + */ +void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr) +{ + auto in1 = static_cast(in1_ptr); + auto in2 = static_cast(in2_ptr); + auto out1 = static_cast(out1_ptr); + auto out2 = static_cast(out2_ptr); + + // Process low and high part + for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16) + { + const int32x4x2_t gx0 = + { + vld1q_s32(in1 + 0), + vld1q_s32(in1 + 4) + }; + + const int32x4x2_t gx1 = + { + vld1q_s32(in1 + 8), + vld1q_s32(in1 + 12) + }; + + const int32x4x2_t gy0 = + { + vld1q_s32(in2 + 0), + vld1q_s32(in2 + 4) + }; + + const int32x4x2_t gy1 = + { + vld1q_s32(in2 + 8), + vld1q_s32(in2 + 12) + }; + + // Compute and store phase + vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0)); + vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1)); + + // Compute magnitude using L1 normalization + const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0); + const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1); + + // Store magnitude + vst1q_u32(out1 + 0, mag0.val[0]); + vst1q_u32(out1 + 4, mag0.val[1]); + vst1q_u32(out1 + 8, mag1.val[0]); + vst1q_u32(out1 + 12, mag1.val[1]); + } +} + +/** Gradient function used when the gradient size = 7 and when the norm_type = L2-norm + * + * @param[in] in1_ptr Pointer to source image. Gx image. Data type supported S32 + * @param[in] in2_ptr Pointer to source image. Gy image. Data type supported S32 + * @param[out] out1_ptr Pointer to destination image. Magnitude. Data type supported U32 + * @param[out] out2_ptr Pointer to destination image. Quantized phase. 
Data type supported U8 + */ +void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out1_ptr, void *__restrict out2_ptr) +{ + auto in1 = static_cast(in1_ptr); + auto in2 = static_cast(in2_ptr); + auto out1 = static_cast(out1_ptr); + auto out2 = static_cast(out2_ptr); + + // Process low and high part + for(size_t i = 0; i < 2; ++i, in1 += 16, in2 += 16, out1 += 16, out2 += 16) + { + const int32x4x2_t gx0 = + { + vld1q_s32(in1 + 0), + vld1q_s32(in1 + 4) + }; + + const int32x4x2_t gx1 = + { + vld1q_s32(in1 + 8), + vld1q_s32(in1 + 12) + }; + + const int32x4x2_t gy0 = + { + vld1q_s32(in2 + 0), + vld1q_s32(in2 + 4) + }; + + const int32x4x2_t gy1 = + { + vld1q_s32(in2 + 8), + vld1q_s32(in2 + 12) + }; + + // Compute and store phase + vst1_u8(out2 + 0, phase_quantization_S32_S32(gx0, gy0)); + vst1_u8(out2 + 8, phase_quantization_S32_S32(gx1, gy1)); + + // Compute magnitude using L2 normalization + const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0); + const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1); + + // Store magnitude + vst1q_u32(out1 + 0, mag0.val[0]); + vst1q_u32(out1 + 4, mag0.val[1]); + vst1q_u32(out1 + 8, mag1.val[0]); + vst1q_u32(out1 + 12, mag1.val[1]); + } +} + +inline uint16x4_t non_max_U32_helper(const uint32_t *in, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr) +{ + // Phase for 4 pixel + const uint32x4_t pc32 = vmovl_u16(pc); + + // Get magnitude for 4 pixel + uint32x4_t mc = vld1q_u32(in); + + // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + // 0 degree + const uint32x4_t mk0_0 = vld1q_u32(in - 1); + const uint32x4_t mk0_1 = vld1q_u32(in + 1); + uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0)); + mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0)); + mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1)); + + // 45 degree + const uint32x4_t mk45_0 = vld1q_u32(in - stride_mag - 1); + const uint32x4_t mk45_1 = vld1q_u32(in + stride_mag + 1); + uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1)); + mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0)); + mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1)); + + // 90 degree + const uint32x4_t mk90_0 = vld1q_u32(in - stride_mag); + const uint32x4_t mk90_1 = vld1q_u32(in + stride_mag); + uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2)); + mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0)); + mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1)); + + // 135 degree + const uint32x4_t mk135_0 = vld1q_u32(in - stride_mag + 1); + const uint32x4_t mk135_1 = vld1q_u32(in + stride_mag - 1); + uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3)); + mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0)); + mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1)); + + // Merge masks + mask0 = vorrq_u32(mask0, mask1); + mask2 = vorrq_u32(mask2, mask3); + mask0 = vorrq_u32(mask0, mask2); + + mc = vbslq_u32(mask0, mc, vdupq_n_u32(0)); + + // mc > upper_thr + mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr)); + + // mc <= lower_thr + mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr)); + + // mc <= upper_thr && mc > lower_thr + mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr)); + mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr))); + + mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc); + mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc); + mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc); + + return vmovn_u32(mc); +} + +/** Computes edge tracing when is called by edge_trace_U8_U8 recursively + * + * @param[in] in Pointer to source image. 
Data type supported U8 + * @param[out] out Pointer to destination image. Data type supported U8 + * @param[in] in_stride Stride of the input image + * @param[in] out_stride Stride of the output image + */ +void edge_trace_recursive_U8_U8(uint8_t *__restrict in, uint8_t *__restrict out, const int32_t in_stride, const int32_t out_stride) +{ + // Look for MAYBE pixels in 8 directions + *out = EDGE; + + // (-1, 0) + uint8_t pixel = *(in - 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(in - 1) = EDGE; + + edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride); + } + + // (+1, 0) + pixel = *(in + 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(in + 1) = EDGE; + + edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride); + } + + in -= in_stride; + out -= out_stride; + + // (-1, -1) + pixel = *(in - 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(in - 1) = EDGE; + + edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride); + } + + // (0, -1) + pixel = *in; + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *in = EDGE; + + edge_trace_recursive_U8_U8(in, out, in_stride, out_stride); + } + + // (+1, -1) + pixel = *(in + 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(in + 1) = EDGE; + + edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride); + } + + in += in_stride * 2; + out += out_stride * 2; + + // (-1, +1) + pixel = *(in - 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(in - 1) = EDGE; + + edge_trace_recursive_U8_U8(in - 1, out - 1, in_stride, out_stride); + } + + // (0, +1) + pixel = *in; + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *in = EDGE; + + edge_trace_recursive_U8_U8(in, out, in_stride, out_stride); + } + + // (+1, +1) + pixel = *(in + 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(in + 1) = EDGE; + + edge_trace_recursive_U8_U8(in + 1, out + 1, in_stride, out_stride); + } +} +} // namespace fp16 + +void NEGradientFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase); + + set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape()); + set_shape_if_empty(*phase->info(), gx->info()->tensor_shape()); + + Format magnitude_format = gx->info()->data_type() == DataType::S16 ? 
Format::U16 : Format::U32; + set_format_if_unknown(*magnitude->info(), magnitude_format); + set_format_if_unknown(*phase->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy); + ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy"); + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + if(_gx->info()->data_type() == DataType::S16) + { + if(norm_type == 1) + { + _func = &fp16::mag_phase_l1norm_S16_S16_U16_U8; + } + else + { + _func = &fp16::mag_phase_l2norm_S16_S16_U16_U8; + } + } + else + { + if(norm_type == 1) + { + _func = &fp16::mag_phase_l1norm_S32_S32_U32_U8; + } + else + { + _func = &fp16::mag_phase_l2norm_S32_S32_U32_U8; + } + } + + constexpr unsigned int num_elems_processed_per_iteration = 32; + + // Configure kernel window + Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access); + + mag_access.set_valid_region(win, _gx->info()->valid_region()); + phase_access.set_valid_region(win, _gx->info()->valid_region()); + + INEKernel::configure(win); +} +#endif + +namespace +{ +inline uint8x8_t phase_quantization(const float32x4x2_t &gx, const float32x4x2_t &gy) +{ + // Constant use for evaluating score1 and score3 + static const float32x4_t const45 = vdupq_n_f32(0.70710678118655f); + static const float32x4_t zero = vdupq_n_f32(0.0f); + static const float32x4_t one = vdupq_n_f32(1.0f); + static const float32x4_t two = vdupq_n_f32(2.0f); + static const float32x4_t three = vdupq_n_f32(3.0f); + + // Score0: (1, 0) + const float32x4x2_t score0 = + { + { + vabsq_f32(gx.val[0]), + vabsq_f32(gx.val[1]) + } + }; + + // Score2: ( 0, 1 ) + const float32x4x2_t score2 = + { + { + vabsq_f32(gy.val[0]), + vabsq_f32(gy.val[1]) + } + }; + + // Score1 and Score3: ( sqrt(2) / 2, sqrt(2) / 2 ) - ( -sqrt(2) / 2, sqrt(2) / 2 ) + float32x4x2_t score1 = + { + { + vmulq_f32(gy.val[0], const45), + vmulq_f32(gy.val[1], const45) + } + }; + + float32x4x2_t score3 = score1; + + score1.val[0] = vmlaq_f32(score1.val[0], gx.val[0], const45); + score1.val[1] = vmlaq_f32(score1.val[1], gx.val[1], const45); + score3.val[0] = vmlsq_f32(score3.val[0], gx.val[0], const45); + score3.val[1] = vmlsq_f32(score3.val[1], gx.val[1], const45); + + score1.val[0] = vabsq_f32(score1.val[0]); + score1.val[1] = vabsq_f32(score1.val[1]); + score3.val[0] = vabsq_f32(score3.val[0]); + score3.val[1] = vabsq_f32(score3.val[1]); + + float32x4x2_t phase = + { + { + zero, + zero + } + }; + + float32x4x2_t old_score = score0; + + // score1 > old_score? 
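+    // Running arg-max over the four direction scores: wherever a candidate score beats the
+    // best score so far, vbslq_f32 selects the new quantized phase value and the new best score.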
+ uint32x4x2_t mask = + { + { + vcgtq_f32(score1.val[0], old_score.val[0]), + vcgtq_f32(score1.val[1], old_score.val[1]) + } + }; + + phase.val[0] = vbslq_f32(mask.val[0], one, phase.val[0]); + phase.val[1] = vbslq_f32(mask.val[1], one, phase.val[1]); + old_score.val[0] = vbslq_f32(mask.val[0], score1.val[0], old_score.val[0]); + old_score.val[1] = vbslq_f32(mask.val[1], score1.val[1], old_score.val[1]); + + // score2 > old_score? + mask.val[0] = vcgtq_f32(score2.val[0], old_score.val[0]); + mask.val[1] = vcgtq_f32(score2.val[1], old_score.val[1]); + + phase.val[0] = vbslq_f32(mask.val[0], two, phase.val[0]); + phase.val[1] = vbslq_f32(mask.val[1], two, phase.val[1]); + old_score.val[0] = vbslq_f32(mask.val[0], score2.val[0], old_score.val[0]); + old_score.val[1] = vbslq_f32(mask.val[1], score2.val[1], old_score.val[1]); + + // score3 > old_score? + mask.val[0] = vcgtq_f32(score3.val[0], old_score.val[0]); + mask.val[1] = vcgtq_f32(score3.val[1], old_score.val[1]); + + phase.val[0] = vbslq_f32(mask.val[0], three, phase.val[0]); + phase.val[1] = vbslq_f32(mask.val[1], three, phase.val[1]); + old_score.val[0] = vbslq_f32(mask.val[0], score3.val[0], old_score.val[0]); + old_score.val[1] = vbslq_f32(mask.val[1], score3.val[1], old_score.val[1]); + + // Convert from float32x4_t to uint8x8_t + return vmovn_u16(vcombine_u16(vmovn_u32(vcvtq_u32_f32(phase.val[0])), + vmovn_u32(vcvtq_u32_f32(phase.val[1])))); +} + +/* Computes the gradient phase if gradient_size = 3 or 5. The output is quantized. + * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return quantized phase for 8 pixels + */ +inline uint8x8_t phase_quantization_S16_S16(int16x8_t gx, int16x8_t gy) +{ + // Convert to float + const float32x4x2_t gx_f32 = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx))) + } + }; + + const float32x4x2_t gy_f32 = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy))) + } + }; + + return phase_quantization(gx_f32, gy_f32); +} + +/* Computes the gradient phase if gradient_size = 7. The output is quantized. 
+ * 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return quantized phase for 8 pixels + */ +inline uint8x8_t phase_quantization_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy) +{ + // Convert to float + const float32x4x2_t gx_f32 = + { + { + vcvtq_f32_s32(gx.val[0]), + vcvtq_f32_s32(gx.val[1]) + } + }; + + const float32x4x2_t gy_f32 = + { + { + vcvtq_f32_s32(gy.val[0]), + vcvtq_f32_s32(gy.val[1]) + } + }; + + return phase_quantization(gx_f32, gy_f32); +} + +/* Computes the magnitude using the L1-norm type if gradient_size = 3 or 5 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint16x8_t mag_l1_S16_S16(int16x8_t gx, int16x8_t gy) +{ + return vaddq_u16(vreinterpretq_u16_s16(vabsq_s16(gx)), + vreinterpretq_u16_s16(vabsq_s16(gy))); +} + +/* Computes the magnitude using the L1-norm type if gradient_size = 7 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint32x4x2_t mag_l1_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy) +{ + const uint32x4x2_t gx_abs = + { + { + vreinterpretq_u32_s32(vabsq_s32(gx.val[0])), + vreinterpretq_u32_s32(vabsq_s32(gx.val[1])) + } + }; + + const uint32x4x2_t gy_abs = + { + { + vreinterpretq_u32_s32(vabsq_s32(gy.val[0])), + vreinterpretq_u32_s32(vabsq_s32(gy.val[1])) + } + }; + + const uint32x4x2_t output = + { + { + vaddq_u32(gx_abs.val[0], gy_abs.val[0]), + vaddq_u32(gx_abs.val[1], gy_abs.val[1]) + } + }; + + return output; +} + +inline float32x4x2_t mag_l2(const float32x4x2_t &gx, const float32x4x2_t &gy) +{ + // x^2 ... + float32x4x2_t magnitude = + { + { + vmulq_f32(gx.val[0], gx.val[0]), + vmulq_f32(gx.val[1], gx.val[1]) + } + }; + + // ... + y^2 + magnitude.val[0] = vmlaq_f32(magnitude.val[0], gy.val[0], gy.val[0]); + magnitude.val[1] = vmlaq_f32(magnitude.val[1], gy.val[1], gy.val[1]); + + // sqrt(...) 
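+    // vrsqrteq_f32 returns an estimate of 1/sqrt(x), so multiplying the squared magnitude by
+    // its reciprocal square-root estimate approximates sqrt(x) without a division.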
+ magnitude.val[0] = vmulq_f32(vrsqrteq_f32(magnitude.val[0]), magnitude.val[0]); + magnitude.val[1] = vmulq_f32(vrsqrteq_f32(magnitude.val[1]), magnitude.val[1]); + + return magnitude; +} + +/* Computes the magnitude using L2-norm if gradient_size = 3 or 5 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint16x8_t mag_l2_S16_S16(int16x8_t gx, int16x8_t gy) +{ + // Compute magnitude using L2 normalization + const float32x4x2_t gx2 = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(gx))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(gx))) + } + }; + + const float32x4x2_t gy2 = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(gy))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(gy))) + } + }; + + const float32x4x2_t magnitude = mag_l2(gx2, gy2); + + // Store magnitude - Convert to uint16x8 + return vcombine_u16(vmovn_u32(vcvtq_u32_f32(magnitude.val[0])), + vmovn_u32(vcvtq_u32_f32(magnitude.val[1]))); +} + +/* Computes the magnitude using L2-norm if gradient_size = 7 + * + * @param[in] gx Gx component + * @param[in] gy Gy component + * + * @return magnitude for 8 pixels + */ +inline uint32x4x2_t mag_l2_S32_S32(const int32x4x2_t &gx, const int32x4x2_t &gy) +{ + // Compute magnitude using L2 normalization + float32x4x2_t gx2 = + { + { + vcvtq_f32_s32(gx.val[0]), + vcvtq_f32_s32(gx.val[1]) + } + }; + + float32x4x2_t gy2 = + { + { + vcvtq_f32_s32(gy.val[0]), + vcvtq_f32_s32(gy.val[1]) + } + }; + + const float32x4x2_t magnitude = mag_l2(gx2, gy2); + const uint32x4x2_t mag32 = + { + { + vcvtq_u32_f32(magnitude.val[0]), + vcvtq_u32_f32(magnitude.val[1]) + } + }; + + return mag32; +} + +/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L1-norm + * + * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16 + * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16 + * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16 + * @param[out] phase_ptr Pointer to destination image. Quantized phase. 
Data type supported U8
+ */
+void mag_phase_l1norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
+    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
+    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
+    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
+
+    const int16x8x4_t gx_val =
+    {
+        {
+            vld1q_s16(gx),
+            vld1q_s16(gx + 8),
+            vld1q_s16(gx + 16),
+            vld1q_s16(gx + 24)
+        }
+    };
+
+    const int16x8x4_t gy_val =
+    {
+        {
+            vld1q_s16(gy),
+            vld1q_s16(gy + 8),
+            vld1q_s16(gy + 16),
+            vld1q_s16(gy + 24)
+        }
+    };
+
+    // Compute and store phase
+    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
+    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
+    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
+    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
+
+    // Compute and store magnitude using L1 normalization
+    vst1q_u16(magnitude + 0, mag_l1_S16_S16(gx_val.val[0], gy_val.val[0]));
+    vst1q_u16(magnitude + 8, mag_l1_S16_S16(gx_val.val[1], gy_val.val[1]));
+    vst1q_u16(magnitude + 16, mag_l1_S16_S16(gx_val.val[2], gy_val.val[2]));
+    vst1q_u16(magnitude + 24, mag_l1_S16_S16(gx_val.val[3], gy_val.val[3]));
+}
+
+/* Gradient function used when the gradient size = 3 or 5 and when the norm_type = L2-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S16
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S16
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U16
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S16_S16_U16_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+    const auto gx        = static_cast<const int16_t *__restrict>(gx_ptr);
+    const auto gy        = static_cast<const int16_t *__restrict>(gy_ptr);
+    const auto magnitude = static_cast<uint16_t *__restrict>(magnitude_ptr);
+    const auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
+
+    const int16x8x4_t gx_val =
+    {
+        {
+            vld1q_s16(gx),
+            vld1q_s16(gx + 8),
+            vld1q_s16(gx + 16),
+            vld1q_s16(gx + 24)
+        }
+    };
+
+    const int16x8x4_t gy_val =
+    {
+        {
+            vld1q_s16(gy),
+            vld1q_s16(gy + 8),
+            vld1q_s16(gy + 16),
+            vld1q_s16(gy + 24)
+        }
+    };
+
+    // Compute and store phase
+    vst1_u8(phase + 0, phase_quantization_S16_S16(gx_val.val[0], gy_val.val[0]));
+    vst1_u8(phase + 8, phase_quantization_S16_S16(gx_val.val[1], gy_val.val[1]));
+    vst1_u8(phase + 16, phase_quantization_S16_S16(gx_val.val[2], gy_val.val[2]));
+    vst1_u8(phase + 24, phase_quantization_S16_S16(gx_val.val[3], gy_val.val[3]));
+
+    // Compute and store magnitude using L2 normalization
+    vst1q_u16(magnitude + 0, mag_l2_S16_S16(gx_val.val[0], gy_val.val[0]));
+    vst1q_u16(magnitude + 8, mag_l2_S16_S16(gx_val.val[1], gy_val.val[1]));
+    vst1q_u16(magnitude + 16, mag_l2_S16_S16(gx_val.val[2], gy_val.val[2]));
+    vst1q_u16(magnitude + 24, mag_l2_S16_S16(gx_val.val[3], gy_val.val[3]));
+}
+
+/* Gradient function used when the gradient size = 7 and when the norm_type = L1-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l1norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
+    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
+    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
+    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
+
+    // Process low and high part
+    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
+    {
+        const int32x4x2_t gx0 =
+        {
+            {
+                vld1q_s32(gx + 0),
+                vld1q_s32(gx + 4)
+            }
+        };
+
+        const int32x4x2_t gx1 =
+        {
+            {
+                vld1q_s32(gx + 8),
+                vld1q_s32(gx + 12)
+            }
+        };
+
+        const int32x4x2_t gy0 =
+        {
+            {
+                vld1q_s32(gy + 0),
+                vld1q_s32(gy + 4)
+            }
+        };
+
+        const int32x4x2_t gy1 =
+        {
+            {
+                vld1q_s32(gy + 8),
+                vld1q_s32(gy + 12)
+            }
+        };
+
+        // Compute and store phase
+        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
+        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
+
+        // Compute magnitude using L1 normalization
+        const uint32x4x2_t mag0 = mag_l1_S32_S32(gx0, gy0);
+        const uint32x4x2_t mag1 = mag_l1_S32_S32(gx1, gy1);
+
+        // Store magnitude
+        vst1q_u32(magnitude + 0, mag0.val[0]);
+        vst1q_u32(magnitude + 4, mag0.val[1]);
+        vst1q_u32(magnitude + 8, mag1.val[0]);
+        vst1q_u32(magnitude + 12, mag1.val[1]);
+    }
+}
+
+/* Gradient function used when the gradient size = 7 and when the norm_type = L2-norm
+ *
+ * @param[in] gx_ptr Pointer to source image. Gx image. Data type supported S32
+ * @param[in] gy_ptr Pointer to source image. Gy image. Data type supported S32
+ * @param[out] magnitude_ptr Pointer to destination image. Magnitude. Data type supported U32
+ * @param[out] phase_ptr Pointer to destination image. Quantized phase. Data type supported U8
+ */
+void mag_phase_l2norm_S32_S32_U32_U8(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr)
+{
+    auto gx        = static_cast<const int32_t *__restrict>(gx_ptr);
+    auto gy        = static_cast<const int32_t *__restrict>(gy_ptr);
+    auto magnitude = static_cast<uint32_t *__restrict>(magnitude_ptr);
+    auto phase     = static_cast<uint8_t *__restrict>(phase_ptr);
+
+    // Process low and high part
+    for(size_t i = 0; i < 2; ++i, gx += 16, gy += 16, magnitude += 16, phase += 16)
+    {
+        const int32x4x2_t gx0 =
+        {
+            {
+                vld1q_s32(gx + 0),
+                vld1q_s32(gx + 4)
+            }
+        };
+
+        const int32x4x2_t gx1 =
+        {
+            {
+                vld1q_s32(gx + 8),
+                vld1q_s32(gx + 12)
+            }
+        };
+
+        const int32x4x2_t gy0 =
+        {
+            {
+                vld1q_s32(gy + 0),
+                vld1q_s32(gy + 4)
+            }
+        };
+
+        const int32x4x2_t gy1 =
+        {
+            {
+                vld1q_s32(gy + 8),
+                vld1q_s32(gy + 12)
+            }
+        };
+
+        // Compute and store phase
+        vst1_u8(phase + 0, phase_quantization_S32_S32(gx0, gy0));
+        vst1_u8(phase + 8, phase_quantization_S32_S32(gx1, gy1));
+
+        // Compute magnitude using L2 normalization
+        const uint32x4x2_t mag0 = mag_l2_S32_S32(gx0, gy0);
+        const uint32x4x2_t mag1 = mag_l2_S32_S32(gx1, gy1);
+
+        // Store magnitude
+        vst1q_u32(magnitude + 0, mag0.val[0]);
+        vst1q_u32(magnitude + 4, mag0.val[1]);
+        vst1q_u32(magnitude + 8, mag1.val[0]);
+        vst1q_u32(magnitude + 12, mag1.val[1]);
+    }
+}
+
+/* Computes non-maxima suppression and hysteresis when the gradient size = 3 or 5
+ *
+ * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U16
+ * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8
+ * @param[out] output_ptr Pointer to output image.
Data type supported U8 + * @param[in] stride_mag Stride of magnitude image + * @param[in] lower_thr Lower threshold used for the hysteresis + * @param[in] upper_thr Upper threshold used for the hysteresis + */ +void non_max_suppression_U16_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr, + const int32_t upper_thr) +{ + const auto magnitude = static_cast(magnitude_ptr); + const auto phase = static_cast(phase_ptr); + const auto output = static_cast(output_ptr); + + // Get magnitude and phase of the centre pixels + uint16x8_t mc = vld1q_u16(magnitude); + + // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase)); + + // 0 degree + const uint16x8_t mk0_0 = vld1q_u16(magnitude - 1); + const uint16x8_t mk0_1 = vld1q_u16(magnitude + 1); + uint16x8_t mask0 = vceqq_u16(pc16, vdupq_n_u16(0)); + mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_0)); + mask0 = vandq_u16(mask0, vcgeq_u16(mc, mk0_1)); + + // 45 degree + const uint16x8_t mk45_0 = vld1q_u16(magnitude - stride_mag - 1); + const uint16x8_t mk45_1 = vld1q_u16(magnitude + stride_mag + 1); + uint16x8_t mask1 = vceqq_u16(pc16, vdupq_n_u16(1)); + mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_0)); + mask1 = vandq_u16(mask1, vcgeq_u16(mc, mk45_1)); + + // 90 degree + const uint16x8_t mk90_0 = vld1q_u16(magnitude - stride_mag); + const uint16x8_t mk90_1 = vld1q_u16(magnitude + stride_mag); + uint16x8_t mask2 = vceqq_u16(pc16, vdupq_n_u16(2)); + mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_0)); + mask2 = vandq_u16(mask2, vcgeq_u16(mc, mk90_1)); + + // 135 degree + const uint16x8_t mk135_0 = vld1q_u16(magnitude - stride_mag + 1); + const uint16x8_t mk135_1 = vld1q_u16(magnitude + stride_mag - 1); + uint16x8_t mask3 = vceqq_u16(pc16, vdupq_n_u16(3)); + mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_0)); + mask3 = vandq_u16(mask3, vcgeq_u16(mc, mk135_1)); + + // Merge masks + mask0 = vorrq_u16(mask0, mask1); + mask2 = vorrq_u16(mask2, mask3); + mask0 = vorrq_u16(mask0, mask2); + + mc = vbslq_u16(mask0, mc, vdupq_n_u16(0)); + + // mc > upper_thr + mask0 = vcgtq_u16(mc, vdupq_n_u16(upper_thr)); + + // mc <= lower_thr + mask1 = vcleq_u16(mc, vdupq_n_u16(lower_thr)); + + // mc <= upper_thr && mc > lower_thr + mask2 = vcleq_u16(mc, vdupq_n_u16(upper_thr)); + mask2 = vandq_u16(mask2, vcgtq_u16(mc, vdupq_n_u16(lower_thr))); + + mc = vbslq_u16(mask0, vdupq_n_u16(EDGE), mc); + mc = vbslq_u16(mask1, vdupq_n_u16(NO_EDGE), mc); + mc = vbslq_u16(mask2, vdupq_n_u16(MAYBE), mc); + + vst1_u8(output, vmovn_u16(mc)); +} + +inline uint16x4_t non_max_U32_helper(const uint32_t *input, const uint16x4_t pc, const uint32_t stride_mag, const int32_t lower_thr, const int32_t upper_thr) +{ + // Phase for 4 pixel + const uint32x4_t pc32 = vmovl_u16(pc); + + // Get magnitude for 4 pixel + uint32x4_t mc = vld1q_u32(input); + + // Angle_quantized: 0 = 0°, 1 = 45°, 2 = 90°, 3 = 135° + // 0 degree + const uint32x4_t mk0_0 = vld1q_u32(input - 1); + const uint32x4_t mk0_1 = vld1q_u32(input + 1); + uint32x4_t mask0 = vceqq_u32(pc32, vdupq_n_u32(0)); + mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_0)); + mask0 = vandq_u32(mask0, vcgeq_u32(mc, mk0_1)); + + // 45 degree + const uint32x4_t mk45_0 = vld1q_u32(input - stride_mag - 1); + const uint32x4_t mk45_1 = vld1q_u32(input + stride_mag + 1); + uint32x4_t mask1 = vceqq_u32(pc32, vdupq_n_u32(1)); + mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_0)); + mask1 = vandq_u32(mask1, vcgeq_u32(mc, mk45_1)); + 
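+    // The 90 and 135 degree checks below follow the same pattern: the centre magnitude is
+    // kept only where the quantized phase selects that direction and the centre is greater
+    // than or equal to both neighbours along it; the per-direction masks are then merged.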
+ // 90 degree + const uint32x4_t mk90_0 = vld1q_u32(input - stride_mag); + const uint32x4_t mk90_1 = vld1q_u32(input + stride_mag); + uint32x4_t mask2 = vceqq_u32(pc32, vdupq_n_u32(2)); + mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_0)); + mask2 = vandq_u32(mask2, vcgeq_u32(mc, mk90_1)); + + // 135 degree + const uint32x4_t mk135_0 = vld1q_u32(input - stride_mag + 1); + const uint32x4_t mk135_1 = vld1q_u32(input + stride_mag - 1); + uint32x4_t mask3 = vceqq_u32(pc32, vdupq_n_u32(3)); + mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_0)); + mask3 = vandq_u32(mask3, vcgeq_u32(mc, mk135_1)); + + // Merge masks + mask0 = vorrq_u32(mask0, mask1); + mask2 = vorrq_u32(mask2, mask3); + mask0 = vorrq_u32(mask0, mask2); + + mc = vbslq_u32(mask0, mc, vdupq_n_u32(0)); + + // mc > upper_thr + mask0 = vcgtq_u32(mc, vdupq_n_u32(upper_thr)); + + // mc <= lower_thr + mask1 = vcleq_u32(mc, vdupq_n_u32(lower_thr)); + + // mc <= upper_thr && mc > lower_thr + mask2 = vcleq_u32(mc, vdupq_n_u32(upper_thr)); + mask2 = vandq_u32(mask2, vcgtq_u32(mc, vdupq_n_u32(lower_thr))); + + mc = vbslq_u32(mask0, vdupq_n_u32(EDGE), mc); + mc = vbslq_u32(mask1, vdupq_n_u32(NO_EDGE), mc); + mc = vbslq_u32(mask2, vdupq_n_u32(MAYBE), mc); + + return vmovn_u32(mc); +} + +/* Computes non-maxima suppression and hysteresis when the gradient_size = 7 + * + * @param[in] magnitude_ptr Pointer to source image. Magnitude. Data type supported U32 + * @param[in] phase_ptr Pointer to source image. Quantized phase. Data type supported U8 + * @param[out] output_ptr Pointer to destination image. Data type supported U8 + * @param[in] stride_mag Stride of magnitude image + * @param[in] lower_thr Lower threshold used for the hysteresis + * @param[in] upper_thr Upper threshold used for the hysteresis + */ +void non_max_suppression_U32_U8_U8(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t lower_thr, + const int32_t upper_thr) +{ + const auto magnitude = static_cast(magnitude_ptr); + const auto phase = static_cast(phase_ptr); + const auto output = static_cast(output_ptr); + + // Get phase for 8 pixel + const uint16x8_t pc16 = vmovl_u8(vld1_u8(phase)); + + // Compute non maxima suppression + const uint16x4x2_t res = + { + { + non_max_U32_helper(magnitude, vget_low_u16(pc16), stride_mag, lower_thr, upper_thr), + non_max_U32_helper(magnitude + 4, vget_high_u16(pc16), stride_mag, lower_thr, upper_thr) + } + }; + + // Store result + vst1_u8(output, vmovn_u16(vcombine_u16(res.val[0], res.val[1]))); +} + +/* Computes edge tracing when is called by edge_trace_U8_U8 recursively + * + * @param[in] input Pointer to source image. Data type supported U8 + * @param[out] output Pointer to destination image. Data type supported U8 + * @param[in] input_stride Stride of the input image + * @param[in] output_stride Stride of the output image + */ +void edge_trace_recursive_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride) +{ + // Look for MAYBE pixels in 8 directions + *output = EDGE; + + // (-1, 0) + uint8_t pixel = *(input - 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(input - 1) = EDGE; + + edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride); + } + + // (+1, 0) + pixel = *(input + 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. 
MAYBE becomes EDGE + *(input + 1) = EDGE; + + edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride); + } + + input -= input_stride; + output -= output_stride; + + // (-1, -1) + pixel = *(input - 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(input - 1) = EDGE; + + edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride); + } + + // (0, -1) + pixel = *input; + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *input = EDGE; + + edge_trace_recursive_U8_U8(input, output, input_stride, output_stride); + } + + // (+1, -1) + pixel = *(input + 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(input + 1) = EDGE; + + edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride); + } + + input += input_stride * 2; + output += output_stride * 2; + + // (-1, +1) + pixel = *(input - 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(input - 1) = EDGE; + + edge_trace_recursive_U8_U8(input - 1, output - 1, input_stride, output_stride); + } + + // (0, +1) + pixel = *input; + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *input = EDGE; + + edge_trace_recursive_U8_U8(input, output, input_stride, output_stride); + } + + // (+1, +1) + pixel = *(input + 1); + + if(pixel == MAYBE) + { + // Touched a MAYBE point. MAYBE becomes EDGE + *(input + 1) = EDGE; + + edge_trace_recursive_U8_U8(input + 1, output + 1, input_stride, output_stride); + } +} + +/* Computes edge tracing + * + * @param[in] input Pointer to source image. Data type supported U8 + * @param[out] output Pointer to destination image. Data type supported U8 + * @param[in] input_stride Stride of the input image + * @param[in] output_stride Stride of the output image + */ +void edge_trace_U8_U8(uint8_t *__restrict input, uint8_t *__restrict output, const int32_t input_stride, const int32_t output_stride) +{ + if(*input == NO_EDGE) + { + *output = NO_EDGE; + } + // Check if EDGE and not yet touched + else if((*input == EDGE) && (*output == NO_EDGE)) + { + edge_trace_recursive_U8_U8(input, output, input_stride, output_stride); + } +} +} // namespace + +NEGradientKernel::NEGradientKernel() + : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) +{ +} + +void NEGradientKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(gx, gy, magnitude, phase); + + set_shape_if_empty(*magnitude->info(), gx->info()->tensor_shape()); + set_shape_if_empty(*phase->info(), gx->info()->tensor_shape()); + + Format magnitude_format = gx->info()->data_type() == DataType::S16 ? 
Format::U16 : Format::U32; + set_format_if_unknown(*magnitude->info(), magnitude_format); + set_format_if_unknown(*phase->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(gx, gy, magnitude, phase); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(gx, gy); + ARM_COMPUTE_ERROR_ON_MSG(element_size_from_data_type(gx->info()->data_type()) != element_size_from_data_type(magnitude->info()->data_type()), "Magnitude must have the same element size as Gx and Gy"); + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + if(_gx->info()->data_type() == DataType::S16) + { + if(norm_type == 1) + { + _func = &mag_phase_l1norm_S16_S16_U16_U8; + } + else + { + _func = &mag_phase_l2norm_S16_S16_U16_U8; + } + } + else + { + if(norm_type == 1) + { + _func = &mag_phase_l1norm_S32_S32_U32_U8; + } + else + { + _func = &mag_phase_l2norm_S32_S32_U32_U8; + } + } + + constexpr unsigned int num_elems_processed_per_iteration = 32; + + // Configure kernel window + Window win = calculate_max_window(*_gx->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal gx_access(_gx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal gy_access(_gy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal mag_access(_magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, gx_access, gy_access, mag_access, phase_access); + + mag_access.set_valid_region(win, _gx->info()->valid_region()); + phase_access.set_valid_region(win, _gx->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEGradientKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator magnitude(_magnitude, window); + Iterator phase(_phase, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func)(gx.ptr(), gy.ptr(), magnitude.ptr(), phase.ptr()); + }, + gx, gy, magnitude, phase); +} + +NEEdgeNonMaxSuppressionKernel::NEEdgeNonMaxSuppressionKernel() + : _func(nullptr), _magnitude(nullptr), _phase(nullptr), _output(nullptr), _lower_thr(0), _upper_thr(0) +{ +} + +BorderSize NEEdgeNonMaxSuppressionKernel::border_size() const +{ + return BorderSize(1); +} + +void NEEdgeNonMaxSuppressionKernel::configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, + int32_t upper_thr, int32_t lower_thr, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(magnitude, phase, output); + + set_shape_if_empty(*output->info(), magnitude->info()->tensor_shape()); + + set_format_if_unknown(*phase->info(), Format::U8); + set_format_if_unknown(*output->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(magnitude, phase, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::U16, DataType::U32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + 
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(phase, output); + + _magnitude = magnitude; + _phase = phase; + _output = output; + + switch(_magnitude->info()->data_type()) + { + case DataType::U16: + _func = &non_max_suppression_U16_U8_U8; + break; + case DataType::U32: + _func = &non_max_suppression_U32_U8_U8; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type!"); + } + + // Set thresholds + _lower_thr = lower_thr; + _upper_thr = upper_thr; + + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 10; + constexpr unsigned int num_rows_read_per_iteration = 3; + + // Configure kernel window + Window win = calculate_max_window(*_magnitude->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowRectangle mag_access(_magnitude->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal phase_access(_phase->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(_output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, mag_access, phase_access, output_access); + + output_access.set_valid_region(win, _magnitude->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEEdgeNonMaxSuppressionKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + Iterator magnitude(_magnitude, window); + Iterator phase(_phase, window); + Iterator output(_output, window); + + const size_t input1_stride = _magnitude->info()->strides_in_bytes()[1]; + const size_t input1_stride_ushort = input1_stride / data_size_from_type(_magnitude->info()->data_type()); + + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func)(magnitude.ptr(), phase.ptr(), output.ptr(), input1_stride_ushort, _lower_thr, _upper_thr); + }, + magnitude, phase, output); +} + +NEEdgeTraceKernel::NEEdgeTraceKernel() + : _input(nullptr), _output(nullptr) +{ +} + +BorderSize NEEdgeTraceKernel::border_size() const +{ + return BorderSize(1); +} + +bool NEEdgeTraceKernel::is_parallelisable() const +{ + return false; +} + +void NEEdgeTraceKernel::configure(ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + set_format_if_unknown(*input->info(), Format::U8); + set_format_if_unknown(*output->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); + + const ValidRegion &input_valid_region = input->info()->valid_region(); + const ValidRegion &output_valid_region = output->info()->valid_region(); + + // Reads can occur within the valid region of the input + border + AccessWindowStatic input_access(input->info(), + input_valid_region.anchor[0] - border_size().left, + input_valid_region.anchor[1] - border_size().top, + input_valid_region.anchor[0] + 
input_valid_region.shape[0] + border_size().right, + input_valid_region.anchor[1] + input_valid_region.shape[1] + border_size().bottom); + + // Writes can occur within the valid region of the output + border + AccessWindowStatic output_access(output->info(), + output_valid_region.anchor[0] - border_size().left, + output_valid_region.anchor[1] - border_size().top, + output_valid_region.anchor[0] + output_valid_region.shape[0] + border_size().right, + output_valid_region.anchor[1] + output_valid_region.shape[1] + border_size().bottom); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, _input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEEdgeTraceKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Iterator input(_input, window); + Iterator output(_output, window); + + const size_t input_stride = _input->info()->strides_in_bytes()[1]; + const size_t output_stride = _output->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + edge_trace_U8_U8(input.ptr(), output.ptr(), input_stride, output_stride); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NEChannelCombineKernel.cpp b/src/core/NEON/kernels/NEChannelCombineKernel.cpp new file mode 100644 index 0000000000..3147a698ad --- /dev/null +++ b/src/core/NEON/kernels/NEChannelCombineKernel.cpp @@ -0,0 +1,467 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/IMultiImage.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEChannelCombineKernel::NEChannelCombineKernel() + : _func(nullptr), _planes{ { nullptr } }, _output(nullptr), _output_multi(nullptr), _x_subsampling{ { 1, 1, 1 } }, _y_subsampling{ { 1, 1, 1 } }, _num_elems_processed_per_iteration(8), +_is_parallelizable(true) +{ +} + +void NEChannelCombineKernel::configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output); + ARM_COMPUTE_ERROR_ON(plane0 == output); + ARM_COMPUTE_ERROR_ON(plane1 == output); + ARM_COMPUTE_ERROR_ON(plane2 == output); + + set_format_if_unknown(*plane0->info(), Format::U8); + set_format_if_unknown(*plane1->info(), Format::U8); + set_format_if_unknown(*plane2->info(), Format::U8); + + if(plane3 != nullptr) + { + set_format_if_unknown(*plane3->info(), Format::U8); + } + + set_shape_if_empty(*output->info(), plane0->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2); + + if(plane3 != nullptr) + { + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, plane3); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane3); + } + + const Format &output_format = output->info()->format(); + + if(output_format == Format::RGBA8888) + { + ARM_COMPUTE_ERROR_ON(plane3 == output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane3, 1, DataType::U8); + } + + _planes[0] = plane0; + _planes[1] = plane1; + _planes[2] = plane2; + _planes[3] = plane3; + _output = output; + _output_multi = nullptr; + + _num_elems_processed_per_iteration = 8; + _is_parallelizable = true; + + switch(output_format) + { + case Format::RGB888: + _func = &NEChannelCombineKernel::combine_3C; + break; + case Format::RGBA8888: + _func = &NEChannelCombineKernel::combine_4C; + break; + case Format::UYVY422: + _x_subsampling[1] = 2; + _x_subsampling[2] = 2; + _num_elems_processed_per_iteration = 16; + _func = &NEChannelCombineKernel::combine_YUV_1p; + break; + case Format::YUYV422: + _x_subsampling[1] = 2; + _x_subsampling[2] = 2; + _num_elems_processed_per_iteration = 16; + _func = &NEChannelCombineKernel::combine_YUV_1p; + break; + default: + ARM_COMPUTE_ERROR("Not supported format."); + break; + } + + TensorShape subsampled_shape_plane1{ plane0->info()->tensor_shape() }; + subsampled_shape_plane1.set(0, subsampled_shape_plane1[0] / _x_subsampling[1]); + TensorShape subsampled_shape_plane2{ plane0->info()->tensor_shape() }; + subsampled_shape_plane2.set(0, subsampled_shape_plane2[0] / _x_subsampling[2]); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane1->info()->tensor_shape(), 
subsampled_shape_plane1); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(plane2->info()->tensor_shape(), subsampled_shape_plane2); + + Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration)); + + AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration); + AccessWindowHorizontal plane0_access(plane0->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[0]); + AccessWindowHorizontal plane1_access(plane1->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[1]); + AccessWindowHorizontal plane2_access(plane2->info(), 0, _num_elems_processed_per_iteration / _x_subsampling[1], 1.f / _x_subsampling[2]); + AccessWindowHorizontal plane3_access(plane3 == nullptr ? nullptr : plane3->info(), 0, _num_elems_processed_per_iteration); + + update_window_and_padding( + win, + plane0_access, + plane1_access, + plane2_access, + plane3_access, + output_access); + + ValidRegion valid_region = intersect_valid_regions(plane0->info()->valid_region(), + plane1->info()->valid_region(), + plane2->info()->valid_region()); + + if(plane3 != nullptr) + { + valid_region = intersect_valid_regions(plane3->info()->valid_region(), valid_region); + } + + output_access.set_valid_region(win, ValidRegion(valid_region.anchor, output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEChannelCombineKernel::configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(plane0, plane1, plane2, output); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane0); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(plane2); + + set_format_if_unknown(*plane0->info(), Format::U8); + set_format_if_unknown(*plane1->info(), Format::U8); + set_format_if_unknown(*plane2->info(), Format::U8); + + set_shape_if_empty(*output->plane(0)->info(), plane0->info()->tensor_shape()); + + switch(output->info()->format()) + { + case Format::NV12: + case Format::NV21: + case Format::IYUV: + { + TensorShape subsampled_shape = plane0->info()->tensor_shape(); + subsampled_shape.set(0, subsampled_shape[0] / 2); + subsampled_shape.set(1, subsampled_shape[1] / 2); + + set_shape_if_empty(*output->plane(1)->info(), subsampled_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(1)->info()->tensor_shape(), subsampled_shape); + + if(output->info()->format() == Format::IYUV) + { + set_shape_if_empty(*output->plane(2)->info(), subsampled_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->plane(2)->info()->tensor_shape(), subsampled_shape); + } + break; + } + case Format::YUV444: + set_shape_if_empty(*output->plane(1)->info(), plane0->info()->tensor_shape()); + set_shape_if_empty(*output->plane(2)->info(), plane0->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane1, plane2, output->plane(1), output->plane(2)); + break; + default: + ARM_COMPUTE_ERROR("Unsupported format"); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(plane0, output->plane(0)); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane0, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(plane2, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(plane0, plane1, plane2); + + _planes[0] = plane0; + _planes[1] = 
plane1; + _planes[2] = plane2; + _planes[3] = nullptr; + _output = nullptr; + _output_multi = output; + bool has_two_planes = false; + unsigned int num_elems_written_plane1 = 8; + + _num_elems_processed_per_iteration = 8; + _is_parallelizable = true; + + const Format &output_format = output->info()->format(); + + switch(output_format) + { + case Format::NV12: + case Format::NV21: + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + _func = &NEChannelCombineKernel::combine_YUV_2p; + has_two_planes = true; + num_elems_written_plane1 = 16; + break; + case Format::IYUV: + _is_parallelizable = false; + _x_subsampling = { { 1, 2, 2 } }; + _y_subsampling = { { 1, 2, 2 } }; + _func = &NEChannelCombineKernel::combine_YUV_3p; + break; + case Format::YUV444: + _is_parallelizable = false; + _x_subsampling = { { 1, 1, 1 } }; + _y_subsampling = { { 1, 1, 1 } }; + _func = &NEChannelCombineKernel::combine_YUV_3p; + break; + default: + ARM_COMPUTE_ERROR("Not supported format."); + break; + } + + const unsigned int y_step = *std::max_element(_y_subsampling.begin(), _y_subsampling.end()); + + Window win = calculate_max_window(*plane0->info(), Steps(_num_elems_processed_per_iteration, y_step)); + AccessWindowRectangle output_plane0_access(output->plane(0)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f, 1.f / _y_subsampling[0]); + AccessWindowRectangle output_plane1_access(output->plane(1)->info(), 0, 0, num_elems_written_plane1, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]); + AccessWindowRectangle output_plane2_access(has_two_planes ? nullptr : output->plane(2)->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]); + + update_window_and_padding(win, + AccessWindowHorizontal(plane0->info(), 0, _num_elems_processed_per_iteration), + AccessWindowRectangle(plane1->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[1], 1.f / _y_subsampling[1]), + AccessWindowRectangle(plane2->info(), 0, 0, _num_elems_processed_per_iteration, 1, 1.f / _x_subsampling[2], 1.f / _y_subsampling[2]), + output_plane0_access, + output_plane1_access, + output_plane2_access); + + ValidRegion plane0_valid_region = plane0->info()->valid_region(); + + ValidRegion output_plane1_region = has_two_planes ? 
intersect_valid_regions(plane1->info()->valid_region(), plane2->info()->valid_region()) : plane2->info()->valid_region(); + + output_plane0_access.set_valid_region(win, ValidRegion(plane0_valid_region.anchor, output->plane(0)->info()->tensor_shape())); + output_plane1_access.set_valid_region(win, ValidRegion(output_plane1_region.anchor, output->plane(1)->info()->tensor_shape())); + output_plane2_access.set_valid_region(win, ValidRegion(plane2->info()->valid_region().anchor, output->plane(2)->info()->tensor_shape())); + + INEKernel::configure(win); +} + +bool NEChannelCombineKernel::is_parallelisable() const +{ + return _is_parallelizable; +} + +void NEChannelCombineKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +void NEChannelCombineKernel::combine_3C(const Window &win) +{ + Iterator p0(_planes[0], win); + Iterator p1(_planes[1], win); + Iterator p2(_planes[2], win); + Iterator out(_output, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto p0_ptr = static_cast(p0.ptr()); + const auto p1_ptr = static_cast(p1.ptr()); + const auto p2_ptr = static_cast(p2.ptr()); + const auto out_ptr = static_cast(out.ptr()); + + const uint8x8x3_t pixels = + { + { + vld1_u8(p0_ptr), + vld1_u8(p1_ptr), + vld1_u8(p2_ptr) + } + }; + + vst3_u8(out_ptr, pixels); + }, + p0, p1, p2, out); +} + +void NEChannelCombineKernel::combine_4C(const Window &win) +{ + Iterator p0(_planes[0], win); + Iterator p1(_planes[1], win); + Iterator p2(_planes[2], win); + Iterator p3(_planes[3], win); + Iterator out(_output, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto p0_ptr = static_cast(p0.ptr()); + const auto p1_ptr = static_cast(p1.ptr()); + const auto p2_ptr = static_cast(p2.ptr()); + const auto p3_ptr = static_cast(p3.ptr()); + const auto out_ptr = static_cast(out.ptr()); + + const uint8x8x4_t pixels = + { + { + vld1_u8(p0_ptr), + vld1_u8(p1_ptr), + vld1_u8(p2_ptr), + vld1_u8(p3_ptr) + } + }; + + vst4_u8(out_ptr, pixels); + }, + p0, p1, p2, p3, out); +} + +template +void NEChannelCombineKernel::combine_YUV_1p(const Window &win) +{ + // Create sub-sampled uv window and init uv planes + Window win_uv(win); + win_uv.set_dimension_step(0, win.x().step() / _x_subsampling[1]); + win_uv.validate(); + + Iterator p0(_planes[0], win); + Iterator p1(_planes[1], win_uv); + Iterator p2(_planes[2], win_uv); + Iterator out(_output, win); + + constexpr auto shift = is_uyvy ? 
1 : 0; + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto p0_ptr = static_cast(p0.ptr()); + const auto p1_ptr = static_cast(p1.ptr()); + const auto p2_ptr = static_cast(p2.ptr()); + const auto out_ptr = static_cast(out.ptr()); + + const uint8x8x2_t pixels_y = vld2_u8(p0_ptr); + const uint8x8x2_t pixels_uv = + { + { + vld1_u8(p1_ptr), + vld1_u8(p2_ptr) + } + }; + + uint8x8x4_t pixels{ {} }; + pixels.val[0 + shift] = pixels_y.val[0]; + pixels.val[1 - shift] = pixels_uv.val[0]; + pixels.val[2 + shift] = pixels_y.val[1]; + pixels.val[3 - shift] = pixels_uv.val[1]; + + vst4_u8(out_ptr, pixels); + }, + p0, p1, p2, out); +} + +void NEChannelCombineKernel::combine_YUV_2p(const Window &win) +{ + ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[1]); + ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[1]); + + // Copy first plane + copy_plane(win, 0); + + // Update UV window + Window uv_win(win); + uv_win.set(Window::DimX, Window::Dimension(uv_win.x().start() / _x_subsampling[1], uv_win.x().end() / _x_subsampling[1], _num_elems_processed_per_iteration)); + uv_win.set(Window::DimY, Window::Dimension(uv_win.y().start() / _y_subsampling[1], uv_win.y().end() / _y_subsampling[1], 1)); + uv_win.validate(); + + // Update output win + Window out_win(win); + out_win.set(Window::DimX, Window::Dimension(out_win.x().start(), out_win.x().end(), out_win.x().step() * 2)); + out_win.set(Window::DimY, Window::Dimension(out_win.y().start() / _y_subsampling[1], out_win.y().end() / _y_subsampling[1], 1)); + out_win.validate(); + + // Construct second plane + const int shift = (Format::NV12 == _output_multi->info()->format()) ? 0 : 1; + Iterator p1(_planes[1 + shift], uv_win); + Iterator p2(_planes[2 - shift], uv_win); + Iterator out(_output_multi->plane(1), out_win); + + execute_window_loop(out_win, [&](const Coordinates & id) + { + const uint8x8x2_t pixels = + { + { + vld1_u8(p1.ptr()), + vld1_u8(p2.ptr()) + } + }; + + vst2_u8(out.ptr(), pixels); + }, + p1, p2, out); +} + +void NEChannelCombineKernel::combine_YUV_3p(const Window &win) +{ + copy_plane(win, 0); + copy_plane(win, 1); + copy_plane(win, 2); +} + +void NEChannelCombineKernel::copy_plane(const Window &win, uint32_t plane_id) +{ + ARM_COMPUTE_ERROR_ON(win.x().start() % _x_subsampling[plane_id]); + ARM_COMPUTE_ERROR_ON(win.y().start() % _y_subsampling[plane_id]); + + // Update window + Window tmp_win(win); + tmp_win.set(Window::DimX, Window::Dimension(tmp_win.x().start() / _x_subsampling[plane_id], tmp_win.x().end() / _x_subsampling[plane_id], _num_elems_processed_per_iteration)); + tmp_win.set(Window::DimY, Window::Dimension(tmp_win.y().start() / _y_subsampling[plane_id], tmp_win.y().end() / _y_subsampling[plane_id], 1)); + tmp_win.validate(); + + Iterator in(_planes[plane_id], tmp_win); + Iterator out(_output_multi->plane(plane_id), tmp_win); + + execute_window_loop(tmp_win, [&](const Coordinates & id) + { + const auto in_ptr = static_cast(in.ptr()); + const auto out_ptr = static_cast(out.ptr()); + + vst1_u8(out_ptr, vld1_u8(in_ptr)); + }, + in, out); +} diff --git a/src/core/NEON/kernels/NEChannelExtractKernel.cpp b/src/core/NEON/kernels/NEChannelExtractKernel.cpp new file mode 100644 index 0000000000..ebc4b85c98 --- /dev/null +++ b/src/core/NEON/kernels/NEChannelExtractKernel.cpp @@ -0,0 +1,354 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/IMultiImage.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEChannelExtractKernel::NEChannelExtractKernel() + : _func(nullptr), _lut_index(0) +{ +} + +void NEChannelExtractKernel::configure(const ITensor *input, Channel channel, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON(input == output); + + set_format_if_unknown(*output->info(), Format::U8); + + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::RGB888, Format::RGBA8888, Format::UYVY422, Format::YUYV422); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + + unsigned int num_elems_processed_per_iteration = 8; + + // Check format and channel + const Format format = input->info()->format(); + const unsigned int subsampling = (format == Format::YUYV422 || format == Format::UYVY422) && channel != Channel::Y ? 
2 : 1; + TensorShape output_shape; + + switch(format) + { + case Format::RGB888: + case Format::RGBA8888: + num_elems_processed_per_iteration = 16; + output_shape = input->info()->tensor_shape(); + + if(format == Format::RGB888) + { + _func = &NEChannelExtractKernel::extract_1C_from_3C_img; + } + else if(format == Format::RGBA8888) + { + _func = &NEChannelExtractKernel::extract_1C_from_4C_img; + } + + switch(channel) + { + case Channel::R: + _lut_index = 0; + break; + case Channel::G: + _lut_index = 1; + break; + case Channel::B: + _lut_index = 2; + break; + case Channel::A: + if(format == Format::RGBA8888) + { + _lut_index = 3; + _func = &NEChannelExtractKernel::extract_1C_from_4C_img; + break; + } + default: + ARM_COMPUTE_ERROR("Not supported channel for this format."); + break; + } + break; + case Format::YUYV422: + case Format::UYVY422: + output_shape = input->info()->tensor_shape(); + + if(channel != Channel::Y) + { + output_shape.set(0, output_shape[0] / 2); + } + + switch(channel) + { + case Channel::Y: + num_elems_processed_per_iteration = 16; + _func = &NEChannelExtractKernel::extract_1C_from_2C_img; + _lut_index = (Format::YUYV422 == format) ? 0 : 1; + break; + case Channel::U: + num_elems_processed_per_iteration = 32; + _func = &NEChannelExtractKernel::extract_YUYV_uv; + _lut_index = (Format::YUYV422 == format) ? 1 : 0; + break; + case Channel::V: + num_elems_processed_per_iteration = 32; + _func = &NEChannelExtractKernel::extract_YUYV_uv; + _lut_index = (Format::YUYV422 == format) ? 3 : 2; + break; + default: + ARM_COMPUTE_ERROR("Not supported channel for this format."); + break; + } + break; + default: + ARM_COMPUTE_ERROR("Not supported format."); + break; + } + + set_shape_if_empty(*output->info(), output_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowRectangle output_access(input->info(), 0, 0, num_elems_processed_per_iteration, 1, 1.f / subsampling, 1.f / subsampling); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + ValidRegion input_valid_region = input->info()->valid_region(); + + output_access.set_valid_region(win, ValidRegion(input_valid_region.anchor, output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEChannelExtractKernel::configure(const IMultiImage *input, Channel channel, IImage *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + + set_format_if_unknown(*output->info(), Format::U8); + + switch(input->info()->format()) + { + case Format::NV12: + case Format::NV21: + case Format::IYUV: + switch(channel) + { + case Channel::Y: + set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output); + break; + case Channel::U: + case Channel::V: + set_shape_if_empty(*output->info(), input->plane(1)->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(1), output); + break; + default: + ARM_COMPUTE_ERROR("Unsupported channel for selected format"); + } + break; + case Format::YUV444: + set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output); + break; + default: + ARM_COMPUTE_ERROR("Unsupported format"); + } + + 
ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::NV12, Format::NV21, Format::IYUV, Format::YUV444); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + + unsigned int num_elems_processed_per_iteration = 32; + + const Format &format = input->info()->format(); + + switch(format) + { + case Format::NV12: + case Format::NV21: + switch(channel) + { + case Channel::Y: + _input = input->plane(0); + _func = &NEChannelExtractKernel::copy_plane; + break; + case Channel::U: + _input = input->plane(1); + num_elems_processed_per_iteration = 16; + _func = &NEChannelExtractKernel::extract_1C_from_2C_img; + _lut_index = (Format::NV12 == format) ? 0 : 1; + break; + case Channel::V: + _input = input->plane(1); + num_elems_processed_per_iteration = 16; + _func = &NEChannelExtractKernel::extract_1C_from_2C_img; + _lut_index = (Format::NV12 == format) ? 1 : 0; + break; + default: + ARM_COMPUTE_ERROR("Not supported channel for this format."); + break; + } + break; + case Format::IYUV: + case Format::YUV444: + _func = &NEChannelExtractKernel::copy_plane; + switch(channel) + { + case Channel::Y: + _input = input->plane(0); + break; + case Channel::U: + _input = input->plane(1); + break; + case Channel::V: + _input = input->plane(2); + break; + default: + ARM_COMPUTE_ERROR("Not supported channel for this format."); + break; + } + break; + default: + ARM_COMPUTE_ERROR("Not supported format."); + break; + } + + _output = output; + Window win = calculate_max_window(*_input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal input_access(_input->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, _input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEChannelExtractKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +void NEChannelExtractKernel::extract_1C_from_2C_img(const Window &win) +{ + Iterator in(_input, win); + Iterator out(_output, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto in_ptr = static_cast(in.ptr()); + const auto out_ptr = static_cast(out.ptr()); + const auto pixels = vld2q_u8(in_ptr); + vst1q_u8(out_ptr, pixels.val[_lut_index]); + }, + in, out); +} + +void NEChannelExtractKernel::extract_1C_from_3C_img(const Window &win) +{ + Iterator in(_input, win); + Iterator out(_output, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto in_ptr = static_cast(in.ptr()); + const auto out_ptr = static_cast(out.ptr()); + const auto pixels = vld3q_u8(in_ptr); + vst1q_u8(out_ptr, pixels.val[_lut_index]); + }, + in, out); +} + +void NEChannelExtractKernel::extract_1C_from_4C_img(const Window &win) +{ + Iterator in(_input, win); + Iterator out(_output, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto in_ptr = static_cast(in.ptr()); + const auto out_ptr = static_cast(out.ptr()); + const auto pixels = vld4q_u8(in_ptr); + vst1q_u8(out_ptr, pixels.val[_lut_index]); + }, + in, out); +} + +void NEChannelExtractKernel::extract_YUYV_uv(const Window &win) +{ + ARM_COMPUTE_ERROR_ON(win.x().step() % 2); + + Window win_out(win); + win_out.set_dimension_step(Window::DimX, win.x().step() / 2); + + Iterator in(_input, 
win);
+    Iterator out(_output, win_out);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
+        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+        const auto pixels  = vld4q_u8(in_ptr);
+        vst1q_u8(out_ptr, pixels.val[_lut_index]);
+    },
+    in, out);
+}
+
+void NEChannelExtractKernel::copy_plane(const Window &win)
+{
+    Iterator in(_input, win);
+    Iterator out(_output, win);
+
+    execute_window_loop(win, [&](const Coordinates &)
+    {
+        const auto in_ptr  = static_cast<uint8_t *>(in.ptr());
+        const auto out_ptr = static_cast<uint8_t *>(out.ptr());
+        vst4_u8(out_ptr, vld4_u8(in_ptr));
+    },
+    in, out);
+}
diff --git a/src/core/NEON/kernels/NECol2ImKernel.cpp b/src/core/NEON/kernels/NECol2ImKernel.cpp
new file mode 100644
index 0000000000..6d370acff1
--- /dev/null
+++ b/src/core/NEON/kernels/NECol2ImKernel.cpp
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +template +void NECol2ImKernel::run_col2im(const Window &window) +{ + const int output_stride_x = _output->info()->strides_in_bytes().x(); + const int output_stride_y = _output->info()->strides_in_bytes().y(); + const int output_stride_z = _output->info()->strides_in_bytes().z(); + + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Create iterators + Iterator in(_input, window); + Iterator out(_output, window_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int hidx = id.y(); + const int idx = id.x() * output_stride_z + (hidx / _convolved_dims.first) * output_stride_y + (hidx % _convolved_dims.first) * output_stride_x; + + *(reinterpret_cast(out.ptr() + idx)) = *(reinterpret_cast(in.ptr())); + }, + in, out); +} + +NECol2ImKernel::NECol2ImKernel() + : _func(), _input(nullptr), _output(nullptr), _convolved_dims() +{ +} + +void NECol2ImKernel::configure(const ITensor *input, ITensor *output, std::pair convolved_dims) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + set_data_type_if_unknown(*output->info(), input->info()->data_type()); + + TensorShape output_shape = input->info()->tensor_shape(); + output_shape.set(0, convolved_dims.first); + output_shape.set(1, convolved_dims.second); + output_shape.set(2, input->info()->tensor_shape()[0]); + + set_shape_if_empty(*output->info(), output_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + _convolved_dims = convolved_dims; + + switch(input->info()->element_size()) + { + case 1: + _func = &NECol2ImKernel::run_col2im; + break; + case 2: + _func = &NECol2ImKernel::run_col2im; + break; + case 4: + _func = &NECol2ImKernel::run_col2im; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps()); + + // The NECol2ImKernel doesn't need padding so update_window_and_padding() can be skipped + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output->info()->set_valid_region(ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NECol2ImKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEColorConvertKernel.cpp b/src/core/NEON/kernels/NEColorConvertKernel.cpp new file mode 100644 index 0000000000..cb5152e2b3 --- /dev/null +++ b/src/core/NEON/kernels/NEColorConvertKernel.cpp @@ -0,0 +1,582 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/IMultiImage.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/MultiImageInfo.h" +#include "arm_compute/core/NEON/NEColorConvertHelper.inl" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NEColorConvertKernel::NEColorConvertKernel() + : _input(nullptr), _output(nullptr), _func(nullptr) +{ +} + +void NEColorConvertKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->format()) + { + case Format::RGBA8888: + { + switch(output->info()->format()) + { + case Format::RGB888: + _func = colorconvert_rgbx_to_rgb; + num_elems_processed_per_iteration = 16; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::UYVY422: + { + switch(output->info()->format()) + { + case Format::RGB888: + _func = colorconvert_yuyv_to_rgb; + num_elems_processed_per_iteration = 32; + break; + case Format::RGBA8888: + _func = colorconvert_yuyv_to_rgb; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::YUYV422: + { + switch(output->info()->format()) + { + case Format::RGB888: + _func = colorconvert_yuyv_to_rgb; + num_elems_processed_per_iteration = 32; + break; + case Format::RGBA8888: + _func = colorconvert_yuyv_to_rgb; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::RGB888: + { + switch(output->info()->format()) + { + case Format::RGBA8888: + _func = colorconvert_rgb_to_rgbx; + num_elems_processed_per_iteration = 16; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _input = input; + _output = output; + + // Configure kernel window + Window win = 
calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEColorConvertKernel::configure(const IMultiImage *input, IImage *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + + set_shape_if_empty(*output->info(), input->plane(0)->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output); + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->format()) + { + case Format::NV12: + { + switch(output->info()->format()) + { + case Format::RGB888: + _func = colorconvert_nv12_to_rgb; + num_elems_processed_per_iteration = 32; + break; + case Format::RGBA8888: + _func = colorconvert_nv12_to_rgb; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::NV21: + { + switch(output->info()->format()) + { + case Format::RGB888: + _func = colorconvert_nv12_to_rgb; + num_elems_processed_per_iteration = 32; + break; + case Format::RGBA8888: + _func = colorconvert_nv12_to_rgb; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::IYUV: + { + switch(output->info()->format()) + { + case Format::RGB888: + _func = colorconvert_iyuv_to_rgb; + num_elems_processed_per_iteration = 32; + break; + case Format::RGBA8888: + _func = colorconvert_iyuv_to_rgb; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + win.set_dimension_step(Window::DimY, 2); + + unsigned int input_plane_count = 3; + + if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21) + { + input_plane_count = 2; + } + + AccessWindowHorizontal input0_access(input->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle input1_access(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f); + AccessWindowRectangle input2_access(input_plane_count == 2 ? 
nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, 0.5f, 0.5f); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input0_access, input1_access, input2_access, + output_access); + + ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), + input->plane(1)->info()->valid_region()); + + if(input_plane_count == 3) + { + intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region()); + } + + output_access.set_valid_region(win, intersect_region); + + INEKernel::configure(win); +} + +void NEColorConvertKernel::configure(const IImage *input, IMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + + set_shape_if_empty(*output->plane(0)->info(), input->info()->tensor_shape()); + + switch(output->info()->format()) + { + case Format::NV12: + { + TensorShape subsampled_shape = input->info()->tensor_shape(); + subsampled_shape.set(0, subsampled_shape[0] / 2); + subsampled_shape.set(1, subsampled_shape[1] / 2); + + set_shape_if_empty(*output->plane(1)->info(), subsampled_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape()); + break; + } + case Format::IYUV: + { + TensorShape subsampled_shape = input->info()->tensor_shape(); + subsampled_shape.set(0, subsampled_shape[0] / 2); + subsampled_shape.set(1, subsampled_shape[1] / 2); + + set_shape_if_empty(*output->plane(1)->info(), subsampled_shape); + set_shape_if_empty(*output->plane(2)->info(), subsampled_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape()); + break; + } + case Format::YUV444: + set_shape_if_empty(*output->plane(1)->info(), input->info()->tensor_shape()); + set_shape_if_empty(*output->plane(2)->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(2)); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output->plane(0)); + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->format()) + { + case Format::RGB888: + { + switch(output->info()->format()) + { + case Format::NV12: + _func = colorconvert_rgb_to_nv12; + num_elems_processed_per_iteration = 16; + break; + case Format::IYUV: + _func = colorconvert_rgb_to_iyuv; + num_elems_processed_per_iteration = 16; + break; + case Format::YUV444: + _func = colorconvert_rgb_to_yuv4; + num_elems_processed_per_iteration = 16; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::RGBA8888: + { + switch(output->info()->format()) + { + case Format::NV12: + _func = colorconvert_rgb_to_nv12; + num_elems_processed_per_iteration = 16; + break; + case Format::IYUV: + _func = colorconvert_rgb_to_iyuv; + num_elems_processed_per_iteration = 16; + break; + case Format::YUV444: + _func = colorconvert_rgb_to_yuv4; + num_elems_processed_per_iteration = 16; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::UYVY422: + { + switch(output->info()->format()) + { + case Format::NV12: + _func = colorconvert_yuyv_to_nv12; + num_elems_processed_per_iteration 
= 32; + break; + case Format::IYUV: + _func = colorconvert_yuyv_to_iyuv; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::YUYV422: + { + switch(output->info()->format()) + { + case Format::NV12: + _func = colorconvert_yuyv_to_nv12; + num_elems_processed_per_iteration = 32; + break; + case Format::IYUV: + _func = colorconvert_yuyv_to_iyuv; + num_elems_processed_per_iteration = 32; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + float sub_sampling = 1.f; + + if((input->info()->format() != Format::RGB888 || output->info()->format() != Format::YUV444) && (input->info()->format() != Format::RGBA8888 || output->info()->format() != Format::YUV444)) + { + win.set_dimension_step(Window::DimY, 2); + sub_sampling = 0.5f; + } + + unsigned int output_plane_count = 3; + + if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21) + { + output_plane_count = 2; + } + + AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); + AccessWindowRectangle output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, sub_sampling, sub_sampling); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output0_access, + output1_access, + output2_access); + + output0_access.set_valid_region(win, input->info()->valid_region()); + output1_access.set_valid_region(win, input->info()->valid_region()); + output2_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEColorConvertKernel::configure(const IMultiImage *input, IMultiImage *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_ERROR_ON(input == output); + + set_shape_if_empty(*output->plane(0)->info(), input->plane(0)->info()->tensor_shape()); + + switch(output->info()->format()) + { + case Format::NV12: + { + TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape(); + subsampled_shape.set(0, subsampled_shape[0] / 2); + subsampled_shape.set(1, subsampled_shape[1] / 2); + + set_shape_if_empty(*output->plane(1)->info(), subsampled_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape()); + break; + } + case Format::IYUV: + { + TensorShape subsampled_shape = input->plane(0)->info()->tensor_shape(); + subsampled_shape.set(0, subsampled_shape[0] / 2); + subsampled_shape.set(1, subsampled_shape[1] / 2); + + set_shape_if_empty(*output->plane(1)->info(), subsampled_shape); + set_shape_if_empty(*output->plane(2)->info(), subsampled_shape); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(1)->info()->tensor_shape()); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(subsampled_shape, output->plane(2)->info()->tensor_shape()); + break; + } + case Format::YUV444: + set_shape_if_empty(*output->plane(1)->info(), input->plane(0)->info()->tensor_shape()); + set_shape_if_empty(*output->plane(2)->info(), 
input->plane(0)->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(2)); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input->plane(0), output->plane(0)); + + switch(input->info()->format()) + { + case Format::NV12: + { + switch(output->info()->format()) + { + case Format::IYUV: + _func = colorconvert_nv12_to_iyuv; + break; + case Format::YUV444: + _func = colorconvert_nv12_to_yuv4; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::NV21: + { + switch(output->info()->format()) + { + case Format::IYUV: + _func = colorconvert_nv12_to_iyuv; + break; + case Format::YUV444: + _func = colorconvert_nv12_to_yuv4; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + case Format::IYUV: + { + switch(output->info()->format()) + { + case Format::NV12: + _func = colorconvert_iyuv_to_nv12; + break; + case Format::YUV444: + _func = colorconvert_iyuv_to_yuv4; + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 32; + constexpr float input_sub_sampling = 0.5f; + const float output_sub_sampling = output->info()->format() == Format::YUV444 ? 1.f : 0.5f; + + // Configure kernel window + Window win = calculate_max_window(*input->plane(0)->info(), Steps(num_elems_processed_per_iteration)); + win.set_dimension_step(Window::DimY, 2); + + unsigned int input_plane_count = 3; + + if(input->info()->format() == Format::NV12 || input->info()->format() == Format::NV21) + { + input_plane_count = 2; + } + + unsigned int output_plane_count = 3; + + if(output->info()->format() == Format::NV12 || output->info()->format() == Format::NV21) + { + output_plane_count = 2; + } + + AccessWindowHorizontal output0_access(output->plane(0)->info(), 0, num_elems_processed_per_iteration); + AccessWindowRectangle output1_access(output->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling); + AccessWindowRectangle output2_access(output_plane_count == 2 ? nullptr : output->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, output_sub_sampling, output_sub_sampling); + + update_window_and_padding(win, + AccessWindowHorizontal(input->plane(0)->info(), 0, num_elems_processed_per_iteration), + AccessWindowRectangle(input->plane(1)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling), + AccessWindowRectangle(input_plane_count == 2 ? 
nullptr : input->plane(2)->info(), 0, 0, num_elems_processed_per_iteration, 1, input_sub_sampling, input_sub_sampling), + output0_access, + output1_access, + output2_access); + + ValidRegion intersect_region = intersect_valid_regions(input->plane(0)->info()->valid_region(), + input->plane(1)->info()->valid_region()); + + if(input_plane_count == 3) + { + intersect_region = intersect_valid_regions(intersect_region, input->plane(2)->info()->valid_region()); + } + + output0_access.set_valid_region(win, intersect_region); + output1_access.set_valid_region(win, intersect_region); + output2_access.set_valid_region(win, intersect_region); + + INEKernel::configure(win); +} + +void NEColorConvertKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _output, window); +} diff --git a/src/core/NEON/kernels/NEConvolutionKernel.cpp b/src/core/NEON/kernels/NEConvolutionKernel.cpp new file mode 100644 index 0000000000..30e91ef253 --- /dev/null +++ b/src/core/NEON/kernels/NEConvolutionKernel.cpp @@ -0,0 +1,1618 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include +#include +#include + +namespace arm_compute +{ +namespace +{ +const uint16x8_t max_int16 = vdupq_n_u16(INT16_MAX); + +inline void store_results(const int32x4_t &out, const int32x4_t &out2, int16_t *output) +{ + const int16x8_t s16results = vcombine_s16(vqmovn_s32(out), + vqmovn_s32(out2)); + vst1q_s16(output, s16results); +} + +inline void store_results(const int32x4_t &out, const int32x4_t &out2, uint8_t *output) +{ + const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovun_s32(out), + vqmovun_s32(out2))); + vst1_u8(output, u8results); +} + +inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, int16_t *output) +{ + const uint16x8_t u16results = vcombine_u16(vqmovn_u32(out), vqmovn_u32(out2)); + const int16x8_t s16results = vreinterpretq_s16_u16(vminq_u16(u16results, max_int16)); + vst1q_s16(output, s16results); +} + +inline void store_results(const uint32x4_t &out, const uint32x4_t &out2, uint8_t *output) +{ + const uint8x8_t u8results = vqmovn_u16(vcombine_u16(vqmovn_u32(out), + vqmovn_u32(out2))); + vst1_u8(output, u8results); +} + +inline void store_results(const int16x8_t &out, const int16x8_t &out2, int16_t *output) +{ + vst1q_s16(output, out); + vst1q_s16(output + 8, out2); +} + +inline void store_results(const int16x8_t &out, const int16x8_t &out2, uint8_t *output) +{ + const uint8x16_t u8results = vcombine_u8(vqmovun_s16(out), + vqmovun_s16(out2)); + vst1q_u8(output, u8results); +} + +inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, uint8_t *output) +{ + const uint8x16_t u8results = vcombine_u8(vqmovn_u16(out), + vqmovn_u16(out2)); + vst1q_u8(output, u8results); +} + +inline void store_results(const uint16x8_t &out, const uint16x8_t &out2, int16_t *output) +{ + vst1q_s16(output, vreinterpretq_s16_u16(vminq_u16(out, max_int16))); + vst1q_s16(output + 8, vreinterpretq_s16_u16(vminq_u16(out2, max_int16))); +} + +inline void convolve_row3x1_unrolled(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16x4_t &mat0, const int16x4_t &mat1, const int16x4_t &mat2) +{ + // Convert to s16 and split in blocks of 4 values: + const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); + const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); + + const int16x4x3_t row = + { + { + vget_low_s16(s16_tmp0), + vget_high_s16(s16_tmp0), + vget_low_s16(s16_tmp1) + } + }; + + // Calculate row left value for pixels [0,3] + out = vmlal_s16(out, row.val[0], mat0); + // Calculate row middle value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); + // Calculate row right value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); + + // Calculate row left value for pixels [4,7] + out2 = vmlal_s16(out2, row.val[1], mat0); + // Calculate row middle value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); + // Calculate row right value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); +} + +inline void 
convolve_row3x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) +{ + const int16x4_t mat0 = vld1_dup_s16(convolution); + const int16x4_t mat1 = vld1_dup_s16(convolution + 1); + const int16x4_t mat2 = vld1_dup_s16(convolution + 2); + + convolve_row3x1_unrolled(out, out2, row_data, mat0, mat1, mat2); +} + +inline void convolve_row5x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) +{ + const int16x4_t mat0 = vld1_dup_s16(convolution); + const int16x4_t mat1 = vld1_dup_s16(convolution + 1); + const int16x4_t mat2 = vld1_dup_s16(convolution + 2); + const int16x4_t mat3 = vld1_dup_s16(convolution + 3); + const int16x4_t mat4 = vld1_dup_s16(convolution + 4); + + // Convert to s16 and split in blocks of 4 values: + const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); + const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); + + const int16x4x3_t row = + { + { + vget_low_s16(s16_tmp0), + vget_high_s16(s16_tmp0), + vget_low_s16(s16_tmp1) + } + }; + + // Calculate row left 2 value for pixels [0,3] + out = vmlal_s16(out, row.val[0], mat0); + // Calculate row left 1 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); + // Calculate row middle value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); + // Calculate row right +1 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3); + // Calculate row right +2 value for pixels [0,3] + out = vmlal_s16(out, row.val[1], mat4); + + // Calculate row left 2 value for pixels [4,7] + out2 = vmlal_s16(out2, row.val[1], mat0); + // Calculate row left 1 value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); + // Calculate row middle value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); + // Calculate row right +1 value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3); + // Calculate row right +2 value for pixels [4,7] + out2 = vmlal_s16(out2, row.val[2], mat4); +} + +inline void convolve_row7x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) +{ + const int16x4_t mat0 = vld1_dup_s16(convolution); + const int16x4_t mat1 = vld1_dup_s16(convolution + 1); + const int16x4_t mat2 = vld1_dup_s16(convolution + 2); + const int16x4_t mat3 = vld1_dup_s16(convolution + 3); + const int16x4_t mat4 = vld1_dup_s16(convolution + 4); + const int16x4_t mat5 = vld1_dup_s16(convolution + 5); + const int16x4_t mat6 = vld1_dup_s16(convolution + 6); + + // Convert to s16 and split in blocks of 4 values: + const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); + const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); + + const int16x4x4_t row = + { + { + vget_low_s16(s16_tmp0), + vget_high_s16(s16_tmp0), + vget_low_s16(s16_tmp1), + vget_high_s16(s16_tmp1) + } + }; + + // Calculate row left 3 value for pixels [0,3] + out = vmlal_s16(out, row.val[0], mat0); + // Calculate row left 2 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); + // Calculate row left 1 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); + // Calculate row middle value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3); + // Calculate row right +1 value for pixels 
[0,3] + out = vmlal_s16(out, row.val[1], mat4); + // Calculate row right +2 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5); + // Calculate row right +3 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6); + + // Calculate row left 3 value for pixels [4,7] + out2 = vmlal_s16(out2, row.val[1], mat0); + // Calculate row left 2 value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); + // Calculate row left 1 value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); + // Calculate row middle value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3); + // Calculate row right +1 value for pixels [4,7] + out2 = vmlal_s16(out2, row.val[2], mat4); + // Calculate row right +2 value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5); + // Calculate row right +3 value for pixels [4,7] + out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6); +} + +inline void convolve_row9x1(int32x4_t &out, int32x4_t &out2, const uint8x16_t &row_data, const int16_t *convolution) +{ + const int16x4_t mat0 = vld1_dup_s16(convolution); + const int16x4_t mat1 = vld1_dup_s16(convolution + 1); + const int16x4_t mat2 = vld1_dup_s16(convolution + 2); + const int16x4_t mat3 = vld1_dup_s16(convolution + 3); + const int16x4_t mat4 = vld1_dup_s16(convolution + 4); + const int16x4_t mat5 = vld1_dup_s16(convolution + 5); + const int16x4_t mat6 = vld1_dup_s16(convolution + 6); + const int16x4_t mat7 = vld1_dup_s16(convolution + 7); + const int16x4_t mat8 = vld1_dup_s16(convolution + 8); + + // Convert to s16 and split in blocks of 4 values: + const int16x8_t s16_tmp0 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(row_data))); + const int16x8_t s16_tmp1 = vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(row_data))); + + const int16x4x4_t row = + { + { + vget_low_s16(s16_tmp0), + vget_high_s16(s16_tmp0), + vget_low_s16(s16_tmp1), + vget_high_s16(s16_tmp1) + } + }; + + // Calculate row left 4 value for pixels [0,3] + out = vmlal_s16(out, row.val[0], mat0); + // Calculate row left 3 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 1), mat1); + // Calculate row left 2 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 2), mat2); + // Calculate row left 1 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[0], row.val[1], 3), mat3); + // Calculate row middle value for pixels [0,3] + out = vmlal_s16(out, row.val[1], mat4); + // Calculate row right +1 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 1), mat5); + // Calculate row right +2 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 2), mat6); + // Calculate row right +3 value for pixels [0,3] + out = vmlal_s16(out, vext_s16(row.val[1], row.val[2], 3), mat7); + // Calculate row right +4 value for pixels [0,3] + out = vmlal_s16(out, row.val[2], mat8); + + // Calculate row left 4 value for pixels [0,3] + out2 = vmlal_s16(out2, row.val[1], mat0); + // Calculate row left 3 value for pixels [0,3] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 1), mat1); + // Calculate row left 2 value for pixels [0,3] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 2), mat2); + // Calculate row left 1 value for pixels [0,3] + out2 = vmlal_s16(out2, vext_s16(row.val[1], row.val[2], 3), mat3); + // Calculate row middle value for pixels [0,3] + out2 
= vmlal_s16(out2, row.val[2], mat4); + // Calculate row right +1 value for pixels [0,3] + out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 1), mat5); + // Calculate row right +2 value for pixels [0,3] + out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 2), mat6); + // Calculate row right +3 value for pixels [0,3] + out2 = vmlal_s16(out2, vext_s16(row.val[2], row.val[3], 3), mat7); + // Calculate row right +4 value for pixels [0,3] + out2 = vmlal_s16(out2, row.val[3], mat8); +} +} // namespace + +/****************************************************************************************\ + * Square Convolution * +\****************************************************************************************/ + +template +NEConvolutionKernel::NEConvolutionKernel() + : INESimpleKernel(), _scale(0), _convolution{ {} } +{ +} + +template +BorderSize NEConvolutionKernel::border_size() const +{ + return BorderSize(matrix_size / 2); +} + +template +void NEConvolutionKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv); + + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + + _input = input; + _output = output; + + std::copy_n(conv, _convolution.size(), _convolution.begin()); + + if(scale == 0) + { + _scale = calculate_matrix_scale(_convolution.data(), matrix_size); + } + else + { + _scale = scale; + } + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, matrix_size), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +template <> +template +void NEConvolutionKernel<3>::convolution(const Window &win) +{ + static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + // Load the matrix's coefficients into NEON registers: + const int16x4_t mat00 = vld1_dup_s16(_convolution.data()); + const int16x4_t mat01 = vld1_dup_s16(_convolution.data() + 1); + const int16x4_t mat02 = vld1_dup_s16(_convolution.data() + 2); + const int16x4_t mat10 = vld1_dup_s16(_convolution.data() + 3); + const int16x4_t mat11 = vld1_dup_s16(_convolution.data() + 4); + const int16x4_t mat12 = vld1_dup_s16(_convolution.data() + 5); + const int16x4_t mat20 = vld1_dup_s16(_convolution.data() + 6); + const int16x4_t mat21 = vld1_dup_s16(_convolution.data() + 7); + const int16x4_t mat22 = vld1_dup_s16(_convolution.data() + 8); + const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); + + const unsigned char *input_top_ptr = _input->buffer() + 
_input->info()->offset_element_in_bytes(Coordinates(-1, -1)); + const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 0)); + const unsigned char *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-1, 1)); + + execute_window_loop(win, [&](const Coordinates & id) + { + int32x4_t out = vdupq_n_s32(0); + int32x4_t out2 = vdupq_n_s32(0); + + // Load 16 bytes from the top row: + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + convolve_row3x1_unrolled(out, out2, top_data, mat00, mat01, mat02); + + // Load 16 bytes from the middle row: + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + convolve_row3x1_unrolled(out, out2, mid_data, mat10, mat11, mat12); + + // Load 16 bytes from the middle row: + const uint8x16_t low_data = vld1q_u8(input_low_ptr + input.offset()); + convolve_row3x1_unrolled(out, out2, low_data, mat20, mat21, mat22); + + // Apply scale + if(_scale != 1) + { + // Convert to F32, scale and convert back to S32 + out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); + out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); + } + + // Clamp and store as U8 or S16: + store_results(out, out2, reinterpret_cast(output.ptr())); + }, + input, output); +} + +template <> +template +void NEConvolutionKernel<5>::convolution(const Window &win) +{ + static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); + + const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -2)); + const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, -1)); + const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 0)); + const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 1)); + const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-2, 2)); + + execute_window_loop(win, [&](const Coordinates & id) + { + int32x4_t out = vdupq_n_s32(0); + int32x4_t out2 = vdupq_n_s32(0); + + // Load 16 bytes from the top2 row: + const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset()); + convolve_row5x1(out, out2, data_t2, _convolution.data()); + + // Load 16 bytes from the top1 row: + const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset()); + convolve_row5x1(out, out2, data_t1, _convolution.data() + 5); + + // Load 16 bytes from the middle row: + const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset()); + convolve_row5x1(out, out2, data_m, _convolution.data() + 10); + + // Load 16 bytes from the low1 row: + const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset()); + convolve_row5x1(out, out2, data_b1, _convolution.data() + 15); + + // Load 16 bytes from the low2 row: + const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset()); + convolve_row5x1(out, out2, data_b2, _convolution.data() + 20); + + // Apply scale + if(_scale != 1) + { + // Convert to F32, scale and convert back to S32 + out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); + out2 = 
vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); + } + + // Clamp and store as U8 or S16: + store_results(out, out2, reinterpret_cast(output.ptr())); + }, + input, output); +} + +template <> +template +void NEConvolutionKernel<7>::convolution(const Window &win) +{ + static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); + + const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -3)); + const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -2)); + const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, -1)); + const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 0)); + const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 1)); + const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 2)); + const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-3, 3)); + + execute_window_loop(win, [&](const Coordinates & id) + { + int32x4_t out = vdupq_n_s32(0); + int32x4_t out2 = vdupq_n_s32(0); + + // Load 16 bytes from the top3 row: + const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset()); + convolve_row7x1(out, out2, data_t3, _convolution.data()); + + // Load 16 bytes from the top2 row: + const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset()); + convolve_row7x1(out, out2, data_t2, _convolution.data() + 7); + + // Load 16 bytes from the top1 row: + const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset()); + convolve_row7x1(out, out2, data_t1, _convolution.data() + 14); + + // Load 16 bytes from the middle row: + const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset()); + convolve_row7x1(out, out2, data_m, _convolution.data() + 21); + + // Load 16 bytes from the low1 row: + const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset()); + convolve_row7x1(out, out2, data_b1, _convolution.data() + 28); + + // Load 16 bytes from the low2 row: + const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset()); + convolve_row7x1(out, out2, data_b2, _convolution.data() + 35); + + // Load 16 bytes from the low3 row: + const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset()); + convolve_row7x1(out, out2, data_b3, _convolution.data() + 42); + + // Apply scale + if(_scale != 1) + { + // Convert to F32, scale and convert back to S32 + out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); + out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); + } + + // Clamp and store as U8 or S16: + store_results(out, out2, reinterpret_cast(output.ptr())); + }, + input, output); +} + +template <> +template +void NEConvolutionKernel<9>::convolution(const Window &win) +{ + static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + const float32x4_t scale_val = vdupq_n_f32(1.0f / 
_scale); + + const unsigned char *input_top4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -4)); + const unsigned char *input_top3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -3)); + const unsigned char *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -2)); + const unsigned char *input_top1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, -1)); + const unsigned char *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 0)); + const unsigned char *input_low1_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 1)); + const unsigned char *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 2)); + const unsigned char *input_low3_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 3)); + const unsigned char *input_low4_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-4, 4)); + + execute_window_loop(win, [&](const Coordinates & id) + { + int32x4_t out = vdupq_n_s32(0); + int32x4_t out2 = vdupq_n_s32(0); + + // Load 16 bytes from the top4 row: + const uint8x16_t data_t4 = vld1q_u8(input_top4_ptr + input.offset()); + convolve_row9x1(out, out2, data_t4, _convolution.data()); + + // Load 16 bytes from the top3 row: + const uint8x16_t data_t3 = vld1q_u8(input_top3_ptr + input.offset()); + convolve_row9x1(out, out2, data_t3, _convolution.data() + 9); + + // Load 16 bytes from the top2 row: + const uint8x16_t data_t2 = vld1q_u8(input_top2_ptr + input.offset()); + convolve_row9x1(out, out2, data_t2, _convolution.data() + 18); + + // Load 16 bytes from the top1 row: + const uint8x16_t data_t1 = vld1q_u8(input_top1_ptr + input.offset()); + convolve_row9x1(out, out2, data_t1, _convolution.data() + 27); + + // Load 16 bytes from the middle row: + const uint8x16_t data_m = vld1q_u8(input_mid_ptr + input.offset()); + convolve_row9x1(out, out2, data_m, _convolution.data() + 36); + + // Load 16 bytes from the low1 row: + const uint8x16_t data_b1 = vld1q_u8(input_low1_ptr + input.offset()); + convolve_row9x1(out, out2, data_b1, _convolution.data() + 45); + + // Load 16 bytes from the low2 row: + const uint8x16_t data_b2 = vld1q_u8(input_low2_ptr + input.offset()); + convolve_row9x1(out, out2, data_b2, _convolution.data() + 54); + + // Load 16 bytes from the low3 row: + const uint8x16_t data_b3 = vld1q_u8(input_low3_ptr + input.offset()); + convolve_row9x1(out, out2, data_b3, _convolution.data() + 63); + + // Load 16 bytes from the low4 row: + const uint8x16_t data_b4 = vld1q_u8(input_low4_ptr + input.offset()); + convolve_row9x1(out, out2, data_b4, _convolution.data() + 72); + + // Apply scale + if(_scale != 1) + { + // Convert to F32, scale and convert back to S32 + out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); + out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); + } + + // Clamp and store as U8 or S16: + store_results(out, out2, reinterpret_cast(output.ptr())); + }, + input, output); +} + +template +void NEConvolutionKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_output->info()->format()) + { + case Format::U8: + convolution(window); + break; + case Format::S16: + convolution(window); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} + 
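+// Illustrative usage sketch for the square convolution kernel above. The tensor objects
+// (src_u8: a U8 input, dst_s16: an S16 output) and the border filling are assumed to be handled
+// by the caller, as the runtime convolution functions normally do; only configure()/run() below
+// come from this kernel's interface.
+//
+//     const int16_t gaussian3x3[9] = { 1, 2, 1,
+//                                      2, 4, 2,
+//                                      1, 2, 1 };
+//     NEConvolutionKernel<3> kernel;
+//     // Passing scale == 0 lets configure() derive the scale from the matrix coefficients
+//     kernel.configure(&src_u8, &dst_s16, gaussian3x3, 0, false /* border_undefined */);
+//     kernel.run(kernel.window());
+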
+template class arm_compute::NEConvolutionKernel<3>;
+template class arm_compute::NEConvolutionKernel<5>;
+template class arm_compute::NEConvolutionKernel<7>;
+template class arm_compute::NEConvolutionKernel<9>;
+
+/****************************************************************************************\
+ *                              Separable Square Convolution                             *
+\****************************************************************************************/
+
+template <unsigned int matrix_size>
+NESeparableConvolutionHorKernel<matrix_size>::NESeparableConvolutionHorKernel()
+    : _conv_row{ { 0 } }, _border_size(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NESeparableConvolutionHorKernel<matrix_size>::border_size() const
+{
+    return _border_size;
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionHorKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_row);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U16, DataType::S16, DataType::S32);
+
+    _input  = input;
+    _output = output;
+    std::copy_n(conv_row, _conv_row.size(), _conv_row.begin());
+    _border_size = BorderSize(border_undefined ? 0 : matrix_size / 2, matrix_size / 2);
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 8;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 8;
+
+    Window                 win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    INEKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionHorKernel<matrix_size>::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    switch(_output->info()->data_type())
+    {
+        case DataType::U16:
+            convolve<uint16_t>(window);
+            break;
+        case DataType::S16:
+            convolve<int16_t>(window);
+            break;
+        case DataType::S32:
+            convolve<int32_t>(window);
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
+            break;
+    }
+}
+
+template <>
+template <>
+inline void NESeparableConvolutionHorKernel<5>::convolve<uint16_t>(const Window &window)
+{
+    Window win_in(window);
+    win_in.shift(Window::DimX, -2);
+
+    Iterator input(_input, win_in);
+    Iterator output(_output, window);
+
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        const uint8x16_t data = vld1q_u8(input.ptr());
+
+        const uint16x8x2_t data_u16 =
+        {
+            {
+                vmovl_u8(vget_low_u8(data)),
+                vmovl_u8(vget_high_u8(data))
+            }
+        };
+
+        uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]);
+        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]);
+        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]);
+        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]);
+        out            = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]);
+
+        vst1q_u16(reinterpret_cast<uint16_t *>(output.ptr()), out);
+    },
+    input, output);
+}
+
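+// The separable path splits an NxN convolution into a horizontal 1D pass (this kernel) followed
+// by a vertical 1D pass, which is valid whenever the matrix is the outer product of two vectors.
+// Worked example with the classic 5x5 Gaussian:
+//
+//     [ 1 ]                     [ 1  4  6  4  1 ]
+//     [ 4 ]                     [ 4 16 24 16  4 ]
+//     [ 6 ] x [ 1 4 6 4 1 ]  =  [ 6 24 36 24  6 ]     overall scale 16 * 16 = 256
+//     [ 4 ]                     [ 4 16 24 16  4 ]
+//     [ 1 ]                     [ 1  4  6  4  1 ]
+//
+// The horizontal pass widens the U8 input to the U16/S16/S32 intermediate chosen in run() above,
+// so the vertical pass can accumulate and apply the final scaling without overflow.
+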
+template <> +template <> +inline void NESeparableConvolutionHorKernel<5>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -2); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]); + + vst1q_s16(reinterpret_cast(output.ptr()), out); + }, + input, output); +} + +template <> +template <> +void NESeparableConvolutionHorKernel<5>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -2); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 1); + const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 2); + const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3); + const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 4); + + int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[1]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[2]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[3]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[4]); + + vst1q_s32(reinterpret_cast(output.ptr()), out_low); + + int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[1]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[2]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[3]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[4]); + + vst1q_s32(reinterpret_cast(output.ptr()) + 4, out_high); + }, + input, output); +} + +template <> +template <> +inline void NESeparableConvolutionHorKernel<7>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -3); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const uint16x8x2_t data_u16 = + { + { + vmovl_u8(vget_low_u8(data)), + vmovl_u8(vget_high_u8(data)) + } + }; + + uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]); + out 
= vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]); + + vst1q_u16(reinterpret_cast(output.ptr()), out); + }, + input, output); +} + +template <> +template <> +inline void NESeparableConvolutionHorKernel<7>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -3); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]); + + vst1q_s16(reinterpret_cast(output.ptr()), out); + }, + input, output); +} + +template <> +template <> +void NESeparableConvolutionHorKernel<7>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -3); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 1); + const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 2); + const int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 3); + const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 4); + const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 5); + const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 6); + + int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[1]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[2]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[3]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[4]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[5]); + out_low = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[6]); + + vst1q_s32(reinterpret_cast(output.ptr()), out_low); + + int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[1]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[2]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[3]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[4]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), 
_conv_row[5]); + out_high = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[6]); + + vst1q_s32(reinterpret_cast(output.ptr()) + 4, out_high); + }, + input, output); +} + +template <> +template <> +inline void NESeparableConvolutionHorKernel<9>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -4); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const uint16x8x2_t data_u16 = + { + { + vmovl_u8(vget_low_u8(data)), + vmovl_u8(vget_high_u8(data)) + } + }; + + uint16x8_t out = vmulq_n_u16(data_u16.val[0], _conv_row[0]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 1), _conv_row[1]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 2), _conv_row[2]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 3), _conv_row[3]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 4), _conv_row[4]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 5), _conv_row[5]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 6), _conv_row[6]); + out = vmlaq_n_u16(out, vextq_u16(data_u16.val[0], data_u16.val[1], 7), _conv_row[7]); + out = vmlaq_n_u16(out, data_u16.val[1], _conv_row[8]); + + vst1q_u16(reinterpret_cast(output.ptr()), out); + }, + input, output); +} + +template <> +template <> +inline void NESeparableConvolutionHorKernel<9>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -4); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out = vmulq_n_s16(data_s16.val[0], _conv_row[0]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), _conv_row[1]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), _conv_row[2]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), _conv_row[3]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4), _conv_row[4]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 5), _conv_row[5]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 6), _conv_row[6]); + out = vmlaq_n_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 7), _conv_row[7]); + out = vmlaq_n_s16(out, data_s16.val[1], _conv_row[8]); + + vst1q_s16(reinterpret_cast(output.ptr()), out); + }, + input, output); +} + +template <> +template <> +void NESeparableConvolutionHorKernel<9>::convolve(const Window &window) +{ + Window win_in(window); + win_in.shift(Window::DimX, -4); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + const int16x8_t data_s16_l3 = vextq_s16(data_s16.val[0], data_s16.val[1], 1); + const int16x8_t data_s16_l2 = vextq_s16(data_s16.val[0], data_s16.val[1], 2); + const int16x8_t data_s16_l1 = vextq_s16(data_s16.val[0], data_s16.val[1], 3); + const 
int16x8_t data_s16_m = vextq_s16(data_s16.val[0], data_s16.val[1], 4);
+        const int16x8_t data_s16_r1 = vextq_s16(data_s16.val[0], data_s16.val[1], 5);
+        const int16x8_t data_s16_r2 = vextq_s16(data_s16.val[0], data_s16.val[1], 6);
+        const int16x8_t data_s16_r3 = vextq_s16(data_s16.val[0], data_s16.val[1], 7);
+
+        int32x4_t out_low = vmull_n_s16(vget_low_s16(data_s16.val[0]), _conv_row[0]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l3), _conv_row[1]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l2), _conv_row[2]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_l1), _conv_row[3]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_m), _conv_row[4]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r1), _conv_row[5]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r2), _conv_row[6]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16_r3), _conv_row[7]);
+        out_low           = vmlal_n_s16(out_low, vget_low_s16(data_s16.val[1]), _conv_row[8]);
+
+        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()), out_low);
+
+        int32x4_t out_high = vmull_n_s16(vget_high_s16(data_s16.val[0]), _conv_row[0]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l3), _conv_row[1]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l2), _conv_row[2]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_l1), _conv_row[3]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_m), _conv_row[4]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r1), _conv_row[5]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r2), _conv_row[6]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16_r3), _conv_row[7]);
+        out_high           = vmlal_n_s16(out_high, vget_high_s16(data_s16.val[1]), _conv_row[8]);
+
+        vst1q_s32(reinterpret_cast<int32_t *>(output.ptr()) + 4, out_high);
+    },
+    input, output);
+}
+
+template class arm_compute::NESeparableConvolutionHorKernel<5>;
+template class arm_compute::NESeparableConvolutionHorKernel<7>;
+template class arm_compute::NESeparableConvolutionHorKernel<9>;
+
+template <unsigned int matrix_size>
+NESeparableConvolutionVertKernel<matrix_size>::NESeparableConvolutionVertKernel()
+    : _conv_col{ { 0 } }, _scale(0)
+{
+}
+
+template <unsigned int matrix_size>
+BorderSize NESeparableConvolutionVertKernel<matrix_size>::border_size() const
+{
+    return BorderSize(matrix_size / 2, 0);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionVertKernel<matrix_size>::configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv_col);
+
+    set_shape_if_empty(*output->info(), input->info()->tensor_shape());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U16, DataType::S16, DataType::S32);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16);
+    ARM_COMPUTE_ERROR_ON(scale == 0);
+
+    _input  = input;
+    _output = output;
+    std::copy_n(conv_col, _conv_col.size(), _conv_col.begin());
+    _scale = scale;
+
+    // Configure kernel window
+    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_read_per_iteration      = 16;
+    constexpr unsigned int num_elems_written_per_iteration   = 16;
+
+    Window                 win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size());
+    AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration);
+
+    update_window_and_padding(win,
+                              AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, matrix_size),
+                              output_access);
+
+    output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size());
+
+    INEKernel::configure(win);
+}
+
+template <unsigned int matrix_size>
+void NESeparableConvolutionVertKernel<matrix_size>::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::U16:
+            switch(_output->info()->data_type())
+            {
+                case DataType::U8:
+                    convolution_u16<uint8_t>(window);
+                    break;
+                case DataType::S16:
+                    convolution_u16<int16_t>(window);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+            break;
+        case DataType::S16:
+            switch(_output->info()->data_type())
+            {
+                case DataType::U8:
+                    convolution_s16<uint8_t>(window);
+                    break;
+                case DataType::S16:
+                    convolution_s16<int16_t>(window);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+            break;
+        case DataType::S32:
+            switch(_output->info()->data_type())
+            {
+                case DataType::U8:
+                    convolution_s32<uint8_t>(window);
+                    break;
+                case DataType::S16:
+                    convolution_s32<int16_t>(window);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Not supported");
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Unsupported intermediate data type!");
+            break;
+    }
+}
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_u16(const Window &win)
+{
+    static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16");
+
+    Window win_in(win);
+    win_in.set_dimension_step(Window::DimX, 8);
+
+    Iterator in(_input, win_in);
+    Iterator out(_output, win);
+
+    std::array<uint8_t *, matrix_size> input_ptrs{ {} };
+    const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale);
+    const int         k_half       = matrix_size / 2;
+
+    // Set row pointers
+    for(int i = -k_half; i <= k_half; ++i)
+    {
+        input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i));
+    }
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        uint16x8_t out0 = vdupq_n_u16(0);
+        uint16x8_t out1 = vdupq_n_u16(0);
+
+        // First half
+        for(unsigned int r = 0; r < matrix_size; ++r)
+        {
+            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
+            out0                  = vmlaq_n_u16(out0, data, _conv_col[r]);
+        }
+
+        in.increment(Window::DimX);
+
+        // Second half
+        for(unsigned int r = 0; r < matrix_size; ++r)
+        {
+            const uint16x8_t data = vld1q_u16(reinterpret_cast<const uint16_t *>(input_ptrs[r] + in.offset()));
+            out1                  = vmlaq_n_u16(out1, data, _conv_col[r]);
+        }
+
+        //scale the result if needed
+        if(_scale != 1)
+        {
+            float32x4_t out0_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out0)));
+            float32x4_t out0_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out0)));
+            out0_f32_high             = vmulq_f32(out0_f32_high, oneoverscale);
+            out0_f32_low              = vmulq_f32(out0_f32_low, oneoverscale);
+            store_results(vcvtq_u32_f32(out0_f32_low), vcvtq_u32_f32(out0_f32_high), reinterpret_cast<OutputType *>(out.ptr()));
+
+            float32x4_t out1_f32_high = vcvtq_f32_u32(vmovl_u16(vget_high_u16(out1)));
+            float32x4_t out1_f32_low  = vcvtq_f32_u32(vmovl_u16(vget_low_u16(out1)));
+            out1_f32_high             = vmulq_f32(out1_f32_high, oneoverscale);
+            out1_f32_low              = vmulq_f32(out1_f32_low, oneoverscale);
+            store_results(vcvtq_u32_f32(out1_f32_low), vcvtq_u32_f32(out1_f32_high), reinterpret_cast<OutputType *>(out.ptr()) + 8);
+        }
+        else
+        {
+            store_results(out0, out1, reinterpret_cast<OutputType *>(out.ptr()));
+        }
+    },
+    in, out);
+}
+
+template <unsigned int matrix_size>
+template <typename OutputType>
+void NESeparableConvolutionVertKernel<matrix_size>::convolution_s16(const Window &win)
+{
+    static_assert(sizeof(OutputType) == sizeof(uint8_t) ||
sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + + Window win_in(win); + win_in.set_dimension_step(Window::DimX, 8); + + Iterator in(_input, win_in); + Iterator out(_output, win); + + std::array input_ptrs{ {} }; + const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale); + const int k_half = matrix_size / 2; + + // Set row pointers + for(int i = -k_half; i <= k_half; ++i) + { + input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + int16x8_t out0 = vdupq_n_s16(0); + int16x8_t out1 = vdupq_n_s16(0); + + // First half + for(unsigned int r = 0; r < matrix_size; ++r) + { + const int16x8_t data = vld1q_s16(reinterpret_cast(input_ptrs[r] + in.offset())); + out0 = vmlaq_n_s16(out0, data, _conv_col[r]); + } + + in.increment(Window::DimX); + + // Second half + for(unsigned int r = 0; r < matrix_size; ++r) + { + const int16x8_t data = vld1q_s16(reinterpret_cast(input_ptrs[r] + in.offset())); + out1 = vmlaq_n_s16(out1, data, _conv_col[r]); + } + + //scale the result if needed + if(_scale != 1) + { + float32x4_t out0_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out0))); + float32x4_t out0_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out0))); + out0_f32_high = vmulq_f32(out0_f32_high, oneoverscale); + out0_f32_low = vmulq_f32(out0_f32_low, oneoverscale); + store_results(vcvtq_s32_f32(out0_f32_low), vcvtq_s32_f32(out0_f32_high), reinterpret_cast(out.ptr())); + + float32x4_t out1_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(out1))); + float32x4_t out1_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(out1))); + out1_f32_high = vmulq_f32(out1_f32_high, oneoverscale); + out1_f32_low = vmulq_f32(out1_f32_low, oneoverscale); + store_results(vcvtq_s32_f32(out1_f32_low), vcvtq_s32_f32(out1_f32_high), reinterpret_cast(out.ptr()) + 8); + } + else + { + store_results(out0, out1, reinterpret_cast(out.ptr())); + } + }, + in, out); +} + +template +template +void NESeparableConvolutionVertKernel::convolution_s32(const Window &win) +{ + static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + + Window win_in(win); + win_in.set_dimension_step(Window::DimX, 8); + + Iterator in(_input, win_in); + Iterator out(_output, win); + + std::array input_ptrs{ {} }; + const float32x4_t oneoverscale = vdupq_n_f32(1.0f / _scale); + const int k_half = matrix_size / 2; + + // Set row pointers + for(int i = -k_half; i <= k_half; ++i) + { + input_ptrs[k_half + i] = _input->ptr_to_element(Coordinates(0, i)); + } + + const int32x4_t zero = vdupq_n_s32(0); + + execute_window_loop(win, [&](const Coordinates & id) + { + int32x4x2_t out0 = + { + { + zero, + zero + } + }; + + int32x4x2_t out1 = + { + { + zero, + zero + } + }; + + // First half + for(unsigned int r = 0; r < matrix_size; ++r) + { + const int32x4x2_t data = vld2q_s32(reinterpret_cast(input_ptrs[r] + in.offset())); + out0.val[0] = vmlaq_n_s32(out0.val[0], data.val[0], _conv_col[r]); + out0.val[1] = vmlaq_n_s32(out0.val[1], data.val[1], _conv_col[r]); + } + + in.increment(Window::DimX); + + // Second half + for(unsigned int r = 0; r < matrix_size; ++r) + { + const int32x4x2_t data = vld2q_s32(reinterpret_cast(input_ptrs[r] + in.offset())); + out1.val[0] = vmlaq_n_s32(out1.val[0], data.val[0], _conv_col[r]); + out1.val[1] = vmlaq_n_s32(out1.val[1], data.val[1], _conv_col[r]); + } + + //scale the result if needed + if(_scale != 1) + { + float32x4_t out0_f32_odd = 
vcvtq_f32_s32(out0.val[0]); + float32x4_t out0_f32_even = vcvtq_f32_s32(out0.val[1]); + out0_f32_odd = vmulq_f32(out0_f32_odd, oneoverscale); + out0_f32_even = vmulq_f32(out0_f32_even, oneoverscale); + out0.val[0] = vcvtq_s32_f32(out0_f32_odd); + out0.val[1] = vcvtq_s32_f32(out0_f32_even); + + float32x4_t out1_f32_odd = vcvtq_f32_s32(out1.val[0]); + float32x4_t out1_f32_even = vcvtq_f32_s32(out1.val[1]); + out1_f32_odd = vmulq_f32(out1_f32_odd, oneoverscale); + out1_f32_even = vmulq_f32(out1_f32_even, oneoverscale); + out1.val[0] = vcvtq_s32_f32(out1_f32_odd); + out1.val[1] = vcvtq_s32_f32(out1_f32_even); + } + + const int32x4x2_t out0_s32 = vzipq_s32(out0.val[0], out0.val[1]); + store_results(out0_s32.val[0], out0_s32.val[1], reinterpret_cast(out.ptr())); + + const int32x4x2_t out1_s32 = vzipq_s32(out1.val[0], out1.val[1]); + store_results(out1_s32.val[0], out1_s32.val[1], reinterpret_cast(out.ptr()) + 8); + }, + in, out); +} + +template class arm_compute::NESeparableConvolutionVertKernel<5>; +template class arm_compute::NESeparableConvolutionVertKernel<7>; +template class arm_compute::NESeparableConvolutionVertKernel<9>; + +/****************************************************************************************\ + * Rectangle Convolution * +\****************************************************************************************/ + +NEConvolutionRectangleKernel::NEConvolutionRectangleKernel() + : _input(nullptr), _output(nullptr), _scale(0), _convolution(), _border_size(), _func_idx(0) +{ +} + +BorderSize NEConvolutionRectangleKernel::border_size() const +{ + return _border_size; +} + +void NEConvolutionRectangleKernel::configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, conv); + + set_shape_if_empty(*output->info(), input->info()->tensor_shape()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(width != 3 && width != 5 && width != 7 && width != 9); + ARM_COMPUTE_ERROR_ON(height != 3 && height != 5 && height != 7 && height != 9); + ARM_COMPUTE_ERROR_ON(0 == scale); + + _input = input; + _output = output; + _scale = scale; + _border_size = BorderSize(height / 2, width / 2); + + // Setup the convolution matrix + const uint32_t nr_elements = width * height; + _convolution.resize(nr_elements); + std::copy_n(conv, nr_elements, _convolution.begin()); + + // Set function index to help choose appropriate function in run() + _func_idx = get_index(height) * 4 + get_index(width); + ARM_COMPUTE_ERROR_ON(_func_idx > (_nr_supported_sizes * _nr_supported_sizes)); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, _border_size); + AccessWindowHorizontal output_access = AccessWindowHorizontal(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, height), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, 
_border_size); + + INEKernel::configure(win); +} + +void NEConvolutionRectangleKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + using ConvolutionRectangleFunction = void (NEConvolutionRectangleKernel::*)(const Window & window); + + // uint8_t function table + static const std::array func_table_u8 = + { + { + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution + } + }; + // int16_t function table + static const std::array func_table_s16 = + { + { + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution, + &NEConvolutionRectangleKernel::convolution + } + }; + + // Run appropriate function + switch(_output->info()->format()) + { + case Format::U8: + ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_u8.size()); + (this->*func_table_u8[_func_idx])(window); + break; + case Format::S16: + ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_s16.size()); + (this->*func_table_s16[_func_idx])(window); + break; + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} + +unsigned int NEConvolutionRectangleKernel::get_index(uint32_t val) +{ + switch(val) + { + case 3: + return 0; + case 5: + return 1; + case 7: + return 2; + case 9: + return 3; + default: + ARM_COMPUTE_ERROR("Not supported dimension size"); + return 0; + } +} + +template +void NEConvolutionRectangleKernel::convolution(const Window &win) +{ + static_assert(sizeof(OutputType) == sizeof(uint8_t) || sizeof(OutputType) == sizeof(int16_t), "The output buffer can only be u8 or s16"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + std::array input_ptrs{ {} }; + const int16_t *conv = _convolution.data(); + const float32x4_t scale_val = vdupq_n_f32(1.0f / _scale); + const int k_row_half = rows / 2; + const int k_col_half = cols / 2; + + // Set row pointers + for(int i = -k_row_half; i <= k_row_half; ++i) + { + input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + int32x4_t out = vdupq_n_s32(0); + int32x4_t out2 = vdupq_n_s32(0); + + // Perform appropriate 
convolution + for(unsigned int r = 0; r < rows; ++r) + { + const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset()); + if(3 == cols) + { + convolve_row3x1(out, out2, data, conv + r * cols); + } + else if(5 == cols) + { + convolve_row5x1(out, out2, data, conv + r * cols); + } + else if(7 == cols) + { + convolve_row7x1(out, out2, data, conv + r * cols); + } + else if(9 == cols) + { + convolve_row9x1(out, out2, data, conv + r * cols); + } + else + { + ARM_COMPUTE_ERROR("Unsupported number of columns"); + } + } + + // Apply scale + if(_scale != 1) + { + // Convert to F32, scale and convert back to S32 + out = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out), scale_val)); + out2 = vcvtq_s32_f32(vmulq_f32(vcvtq_f32_s32(out2), scale_val)); + } + + // Clamp and store as U8 or S16: + store_results(out, out2, reinterpret_cast(output.ptr())); + }, + input, output); +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp new file mode 100644 index 0000000000..32789cbe33 --- /dev/null +++ b/src/core/NEON/kernels/NECumulativeDistributionKernel.cpp @@ -0,0 +1,110 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h"
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IDistribution1D.h"
+#include "arm_compute/core/ILut.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+#include <algorithm>
+#include <cmath>
+#include <numeric>
+
+using namespace arm_compute;
+
+NECumulativeDistributionKernel::NECumulativeDistributionKernel()
+    : _input(nullptr), _distribution(nullptr), _cumulative_sum(nullptr), _output(nullptr)
+{
+}
+
+bool NECumulativeDistributionKernel::is_parallelisable() const
+{
+    return false;
+}
+
+void NECumulativeDistributionKernel::configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, distribution, cumulative_sum, output);
+    ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input);
+
+    set_format_if_unknown(*input->info(), Format::U8);
+
+    ARM_COMPUTE_ERROR_ON(distribution->num_bins() != cumulative_sum->num_bins());
+    ARM_COMPUTE_ERROR_ON(distribution->num_bins() != output->num_elements());
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8);
+    ARM_COMPUTE_ERROR_ON(input->info()->data_type() != output->type());
+
+    _input          = input;
+    _distribution   = distribution;
+    _cumulative_sum = cumulative_sum;
+    _output         = output;
+
+    INEKernel::configure(calculate_max_window(*input->info()));
+}
+
+void NECumulativeDistributionKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON(_distribution->buffer() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_cumulative_sum->buffer() == nullptr);
+    ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr);
+    ARM_COMPUTE_ERROR_ON_MSG(_distribution->num_bins() < 256, "Distribution must have at least 256 bins");
+
+    // Calculate the cumulative distribution (summed histogram).
+    const uint32_t *hist           = _distribution->buffer();
+    uint32_t       *cumulative_sum = _cumulative_sum->buffer();
+    uint8_t        *output         = _output->buffer();
+
+    // Calculate cumulative distribution
+    std::partial_sum(hist, hist + _histogram_size, cumulative_sum);
+
+    // Get the number of pixels that have the lowest value in the input image
+    const uint32_t cd_min = *std::find_if(hist, hist + _histogram_size, [](const uint32_t &v)
+    {
+        return v > 0;
+    });
+    const uint32_t image_size = cumulative_sum[_histogram_size - 1];
+
+    ARM_COMPUTE_ERROR_ON(cd_min > image_size);
+
+    // Create mapping lookup table
+    if(image_size == cd_min)
+    {
+        std::iota(output, output + _histogram_size, 0);
+    }
+    else
+    {
+        const float diff = image_size - cd_min;
+
+        for(unsigned int x = 0; x < _histogram_size; ++x)
+        {
+            output[x] = lround((cumulative_sum[x] - cd_min) / diff * 255.0f);
+        }
+    }
+}
diff --git a/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..902490ec38
--- /dev/null
+++ b/src/core/NEON/kernels/NEDepthConcatenateKernel.cpp
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include + +using namespace arm_compute; + +NEDepthConcatenateKernel::NEDepthConcatenateKernel() + : _input(nullptr), _output(nullptr), _top_bottom(0), _left_right(0), _depth_offset(0) +{ +} + +BorderSize NEDepthConcatenateKernel::border_size() const +{ + return BorderSize(_top_bottom, _left_right); +} + +void NEDepthConcatenateKernel::configure(const ITensor *input, unsigned int depth_offset, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(2) + depth_offset > output->info()->dimension(2)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) > output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) > output->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(3, input, output); + + // The gaps between the two lowest dimensions of input and output need to be divisible by 2 + // Otherwise it is not clear how the padding should be added onto the input tensor + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) - input->info()->dimension(0)) % 2); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(1) - input->info()->dimension(1)) % 2); + + _input = input; + _output = output; + _depth_offset = depth_offset; + _left_right = (output->info()->dimension(0) - input->info()->dimension(0)) / 2; + _top_bottom = (output->info()->dimension(1) - input->info()->dimension(1)) / 2; + + const unsigned int num_elems_processed_per_iteration = 4; + const unsigned int num_elems_read_per_iteration = 4; + const unsigned int num_rows_read_per_iteration = 1; + + // The window needs to be based on input as we copy all the depths of input + Window win = calculate_max_enlarged_window(*input->info(), Steps(num_elems_processed_per_iteration), border_size()); + + AccessWindowRectangle input_access(input->info(), -_left_right, -_top_bottom, num_elems_read_per_iteration, num_rows_read_per_iteration); + AccessWindowHorizontal 
output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEDepthConcatenateKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + // Offset output + const unsigned int offset_to_first_elements_in_bytes = _output->info()->offset_first_element_in_bytes() + _left_right * _output->info()->strides_in_bytes()[0] + _top_bottom * + _output->info()->strides_in_bytes()[1] + _depth_offset * _output->info()->strides_in_bytes()[2]; + uint8_t *output_ptr = _output->buffer() + offset_to_first_elements_in_bytes; + + Iterator input(_input, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output_ptr + output.offset()); + + vst1q_f32(out_ptr, vld1q_f32(in_ptr)); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NEDepthConvertKernel.cpp b/src/core/NEON/kernels/NEDepthConvertKernel.cpp new file mode 100644 index 0000000000..56612a7703 --- /dev/null +++ b/src/core/NEON/kernels/NEDepthConvertKernel.cpp @@ -0,0 +1,384 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEDepthConvertKernel::NEDepthConvertKernel() + : _policy(), _shift(0) +{ +} + +void NEDepthConvertKernel::configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::U16, DataType::U32, DataType::S32, DataType::F32); + ARM_COMPUTE_ERROR_ON(shift >= 8); + ARM_COMPUTE_ERROR_ON(input == output); + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == output->info()->data_type(), "Input and output data_types must be different"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::QS8 && (output->info()->data_type() != DataType::F32), + "Only data_types supported [in] QS8 -> [out] F32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U8 && (output->info()->data_type() != DataType::S16 && output->info()->data_type() != DataType::U16 + && output->info()->data_type() != DataType::S32), + "Only data_types supported [in] U8 -> [out] U16, S16, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::U16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::U32), + "Only data_types supported [in] U16 -> [out] U8, U32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::S16 && (output->info()->data_type() != DataType::U8 && output->info()->data_type() != DataType::S32), + "Only data_types supported [in] S16 -> [out] U8, S32"); + + ARM_COMPUTE_ERROR_ON_MSG(input->info()->data_type() == DataType::F32 && (output->info()->data_type() != DataType::QS8), + "Only data_types supported [in] F32 -> [out] QS8"); + + _policy = policy; + _shift = shift; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + INESimpleKernel::configure(input, output, num_elems_processed_per_iteration); +} + +void NEDepthConvertKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + ARM_COMPUTE_ERROR_ON(nullptr == _input); + ARM_COMPUTE_ERROR_ON(nullptr == _output); + ARM_COMPUTE_ERROR_ON(_input == _output); + + Iterator input(_input, window); + Iterator output(_output, window); + + switch(_input->info()->data_type()) + { + case DataType::QS8: + { + const int fixed_point_position = _input->info()->fixed_point_position(); + + switch(_output->info()->data_type()) + { + case DataType::F32: + { + /* Up-conversion QS8 -> F32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const int8x16_t texels_s8 = vld1q_s8(reinterpret_cast(input.ptr())); + + float32x4x2_t texels_low = vcvt_f32_qs8(vget_low_s8(texels_s8), fixed_point_position); + float32x4x2_t texels_high = vcvt_f32_qs8(vget_high_s8(texels_s8), fixed_point_position); + + vst1q_f32(reinterpret_cast(output.ptr()), texels_low.val[0]); + vst1q_f32(reinterpret_cast(output.ptr()) + 4, texels_low.val[1]); + 
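+                        /* The high half (texels_high) follows at offsets 8 and 12, so each iteration widens 16 QS8 values into 16 F32 values */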
vst1q_f32(reinterpret_cast(output.ptr()) + 8, texels_high.val[0]); + vst1q_f32(reinterpret_cast(output.ptr()) + 12, texels_high.val[1]); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::U8: + { + const int16x8_t b = vdupq_n_s16(_shift); + + switch(_output->info()->data_type()) + { + case DataType::S16: + { + /* Up-conversion U8 -> S16 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const int16x8x2_t texels = + { + { + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) + } + }; + + vst1q_s16(reinterpret_cast(output.ptr()), texels.val[0]); + vst1q_s16(reinterpret_cast(output.ptr()) + 8, texels.val[1]); + }, + input, output); + break; + } + case DataType::S32: + { + /* Up-conversion U8 -> S32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const int16x8x2_t texels = + { + { + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(texels_u8))), b), + vshlq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(texels_u8))), b) + } + }; + + vst1q_s32(reinterpret_cast(output.ptr()), vmovl_s16(vget_low_s16(texels.val[0]))); + vst1q_s32(reinterpret_cast(output.ptr()) + 4, vmovl_s16(vget_high_s16(texels.val[0]))); + vst1q_s32(reinterpret_cast(output.ptr()) + 8, vmovl_s16(vget_low_s16(texels.val[1]))); + vst1q_s32(reinterpret_cast(output.ptr()) + 12, vmovl_s16(vget_high_s16(texels.val[1]))); + }, + input, output); + break; + } + case DataType::U16: + { + /* Up-conversion U8 -> U16 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t texels_u8 = vld1q_u8(input.ptr()); + + const uint16x8x2_t texels = + { + { + vshlq_u16(vmovl_u8(vget_low_u8(texels_u8)), b), + vshlq_u16(vmovl_u8(vget_high_u8(texels_u8)), b) + } + }; + + vst1q_u16(reinterpret_cast(output.ptr()), texels.val[0]); + vst1q_u16(reinterpret_cast(output.ptr()) + 8, texels.val[1]); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::S16: + { + switch(_output->info()->data_type()) + { + case DataType::U8: + { + const int16x8_t b = vdupq_n_s16(-static_cast(_shift)); + + /* Down-conversion S16 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vqshlq_s16(vld1q_s16(reinterpret_cast(input.ptr())), b), + vqshlq_s16(vld1q_s16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vqmovun_s16(texels.val[0]), vqmovun_s16(texels.val[1]))); + }, + input, output); + } + else + { + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vshlq_s16(vld1q_s16(reinterpret_cast(input.ptr())), b), + vshlq_s16(vld1q_s16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(vreinterpretq_u16_s16(texels.val[0])), + vmovn_u16(vreinterpretq_u16_s16(texels.val[1])))); + }, + input, output); + } + break; + } + case DataType::S32: + { + const int32x4_t b = vdupq_n_s32(_shift); + + /* Up-conversion S16 -> S32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t texels = + { + { + vld1q_s16(reinterpret_cast(input.ptr())), + vld1q_s16(reinterpret_cast(input.ptr()) + 8) + } + }; + + const 
int32x4x4_t texels_s32 = + { + { + vshlq_s32(vmovl_s16(vget_low_s16(texels.val[0])), b), + vshlq_s32(vmovl_s16(vget_high_s16(texels.val[0])), b), + vshlq_s32(vmovl_s16(vget_low_s16(texels.val[1])), b), + vshlq_s32(vmovl_s16(vget_high_s16(texels.val[1])), b) + } + }; + + vst1q_s32(reinterpret_cast(output.ptr()), texels_s32.val[0]); + vst1q_s32(reinterpret_cast(output.ptr()) + 4, texels_s32.val[1]); + vst1q_s32(reinterpret_cast(output.ptr()) + 8, texels_s32.val[2]); + vst1q_s32(reinterpret_cast(output.ptr()) + 12, texels_s32.val[3]); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::U16: + { + switch(_output->info()->data_type()) + { + case DataType::U8: + { + const int16x8_t b = vdupq_n_s16(-static_cast(_shift)); + + /* Down-conversion U16 -> U8 */ + if(ConvertPolicy::SATURATE == _policy) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x8x2_t texels = + { + { + vqshlq_u16(vld1q_u16(reinterpret_cast(input.ptr())), b), + vqshlq_u16(vld1q_u16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vqmovn_u16(texels.val[0]), vqmovn_u16(texels.val[1]))); + }, + input, output); + } + else + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x8x2_t texels = + { + { + vshlq_u16(vld1q_u16(reinterpret_cast(input.ptr())), b), + vshlq_u16(vld1q_u16(reinterpret_cast(input.ptr()) + 8), b) + } + }; + + vst1q_u8(output.ptr(), vcombine_u8(vmovn_u16(texels.val[0]), vmovn_u16(texels.val[1]))); + }, + input, output); + } + break; + } + case DataType::U32: + { + const int32x4_t b = vdupq_n_s32(_shift); + + /* Up-conversion U16 -> U32 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x8x2_t texels = + { + { + vld1q_u16(reinterpret_cast(input.ptr())), + vld1q_u16(reinterpret_cast(input.ptr()) + 8) + } + }; + + vst1q_u32(reinterpret_cast(output.ptr()), vshlq_u32(vmovl_u16(vget_low_u16(texels.val[0])), b)); + vst1q_u32(reinterpret_cast(output.ptr()) + 4, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[0])), b)); + vst1q_u32(reinterpret_cast(output.ptr()) + 8, vshlq_u32(vmovl_u16(vget_low_u16(texels.val[1])), b)); + vst1q_u32(reinterpret_cast(output.ptr()) + 12, vshlq_u32(vmovl_u16(vget_high_u16(texels.val[1])), b)); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + case DataType::F32: + { + switch(_output->info()->data_type()) + { + case DataType::QS8: + { + const int fixed_point_position = _output->info()->fixed_point_position(); + /* Down-conversion F32 -> QS8 */ + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4x4_t texels_f32 = + { + { + vld1q_f32(reinterpret_cast(input.ptr())), + vld1q_f32(reinterpret_cast(input.ptr()) + 4), + vld1q_f32(reinterpret_cast(input.ptr()) + 8), + vld1q_f32(reinterpret_cast(input.ptr()) + 12) + } + }; + + const qint8x16_t texels_s8 = vcvtq_qs8_f32(texels_f32, fixed_point_position); + + vst1q_s8(reinterpret_cast(output.ptr()), texels_s8); + }, + input, output); + break; + } + default: + ARM_COMPUTE_ERROR("Output data type not supported"); + } + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + } +} diff --git a/src/core/NEON/kernels/NEDerivativeKernel.cpp b/src/core/NEON/kernels/NEDerivativeKernel.cpp new file mode 100644 index 0000000000..bf7e0972d5 --- /dev/null +++ b/src/core/NEON/kernels/NEDerivativeKernel.cpp @@ -0,0 +1,224 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEDerivativeKernel::NEDerivativeKernel() + : _func(nullptr), _input(nullptr), _output_x(nullptr), _output_y(nullptr) +{ +} + +BorderSize NEDerivativeKernel::border_size() const +{ + return BorderSize(1); +} + +void NEDerivativeKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + const bool run_der_x = output_x != nullptr; + const bool run_der_y = output_y != nullptr; + + if(run_der_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(run_der_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + + AccessWindowHorizontal out_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal out_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal in_x_access(input->info(), -border_size().left, num_elems_processed_per_iteration); + AccessWindowRectangle in_y_access(input->info(), 0, -border_size().left, num_elems_processed_per_iteration, num_rows_read_per_iteration); + AccessWindowRectangle in_xy_access(input->info(), -border_size().left, -border_size().top, num_elems_processed_per_iteration, num_rows_read_per_iteration); + + if(run_der_x && run_der_y) + { + _func = &NEDerivativeKernel::derivative_xy; + update_window_and_padding(win, in_xy_access, out_x_access, out_y_access); + out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + } + else + { + if(run_der_x) + { + _func = &NEDerivativeKernel::derivative_x; + update_window_and_padding(win, in_x_access, out_x_access); + out_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + } + else if(run_der_y) + { + _func = &NEDerivativeKernel::derivative_y; + update_window_and_padding(win, in_y_access, out_y_access); + out_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + } + else + { + ARM_COMPUTE_ERROR("At least one output must be NOT NULL"); + } + } + + INEKernel::configure(win); +} + +void NEDerivativeKernel::derivative_x(const Window &window) +{ + Iterator in(_input, window); + Iterator out_x(_output_x, window); + + /* Apply 1-D centered point discrete derivative mask ([-1 0 1]) along the X direction */ + execute_window_loop(window, [&](const Coordinates & id) + { + /* Load left and right data */ + const uint8x16_t l_data = vld1q_u8(in.ptr() - 1); + const uint8x16_t r_data = vld1q_u8(in.ptr() + 1); + + /* Cast to int16 and perform the subtraction between the right and left data */ + const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data)))); + + /* Cast to int16 and perform the subtraction between the right and left data */ + const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data)))); + + /* Store result of derivative along the X direction */ + vst1q_s16(reinterpret_cast(out_x.ptr()), out0); + vst1q_s16(reinterpret_cast(out_x.ptr()) + 8, out1); + }, + in, out_x); +} + +void NEDerivativeKernel::derivative_y(const Window &window) +{ + Iterator in(_input, window); + Iterator out_y(_output_y, window); + + const size_t stride = _input->info()->strides_in_bytes()[1]; + + /* Apply 1-D centered point discrete derivative mask ([-1 0 1]^T) along the Y direction */ + execute_window_loop(window, [&](const Coordinates & id) + { + /* Load top and bottom data */ + const uint8x16_t t_data = vld1q_u8(in.ptr() - stride); + const uint8x16_t b_data = vld1q_u8(in.ptr() + stride); + + /* Cast to int16 and perform the subtraction between the bottom and top data */ + const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data)))); + + /* Cast to int16 and perform the subtraction between the bottom and top data */ + const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data)))); + + /* Store result of derivative along the Y direction */ + 
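+        /* out0 holds the Y derivative of the first eight pixels of the row, out1 of the remaining eight */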
vst1q_s16(reinterpret_cast(out_y.ptr()), out0); + vst1q_s16(reinterpret_cast(out_y.ptr()) + 8, out1); + }, + in, out_y); +} + +void NEDerivativeKernel::derivative_xy(const Window &window) +{ + Iterator in(_input, window); + Iterator out_x(_output_x, window); + Iterator out_y(_output_y, window); + + const size_t stride = _input->info()->strides_in_bytes()[1]; + + /* Apply 1-D centered point discrete derivative masks ([-1 0 1] and [-1 0 1]^T) along the X and Y directions */ + execute_window_loop(window, [&](const Coordinates & id) + { + /* Load top, bottom, left and right data */ + const uint8x16_t t_data = vld1q_u8(in.ptr() - stride); + const uint8x16_t b_data = vld1q_u8(in.ptr() + stride); + const uint8x16_t l_data = vld1q_u8(in.ptr() - 1); + const uint8x16_t r_data = vld1q_u8(in.ptr() + 1); + + /* Cast to int16 and perform the subtraction between the bottom and top data */ + const int16x8_t out0 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(b_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(t_data)))); + + /* Cast to int16 and perform the subtraction between the bottom and top data */ + const int16x8_t out1 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(b_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(t_data)))); + + /* Cast to int16 and perform the subtraction between the right and left data */ + const int16x8_t out2 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(r_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(l_data)))); + + /* Cast to int16 and perform the subtraction between the right and left data */ + const int16x8_t out3 = vsubq_s16(vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(r_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(l_data)))); + + /* Store result of derivative along the Y direction */ + vst1q_s16(reinterpret_cast(out_y.ptr()), out0); + vst1q_s16(reinterpret_cast(out_y.ptr()) + 8, out1); + + /* Store result of derivative along the X direction */ + vst1q_s16(reinterpret_cast(out_x.ptr()), out2); + vst1q_s16(reinterpret_cast(out_x.ptr()) + 8, out3); + }, + in, out_x, out_y); +} + +void NEDerivativeKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEDilateKernel.cpp b/src/core/NEON/kernels/NEDilateKernel.cpp new file mode 100644 index 0000000000..867cf77c49 --- /dev/null +++ b/src/core/NEON/kernels/NEDilateKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDilateKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +BorderSize NEDilateKernel::border_size() const +{ + return BorderSize(1); +} + +void NEDilateKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEDilateKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator in(_input, window); + Iterator out(_output, window); + + const size_t in_stride = _input->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates &) + { + uint8_t *in_ptr = in.ptr() - 1; + const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride); + const uint8x16_t mid_data = vld1q_u8(in_ptr); + const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride); + + uint8x8_t top_high_data = vget_high_u8(top_data); + uint8x8_t top_low_data = vget_low_u8(top_data); + + uint8x8_t mid_high_data = vget_high_u8(mid_data); + uint8x8_t mid_low_data = vget_low_u8(mid_data); + + uint8x8_t bot_high_data = vget_high_u8(bot_data); + uint8x8_t bot_low_data = vget_low_u8(bot_data); + + uint8x8_t p0, p1; + + p0 = top_low_data; + p1 = vext_u8(top_low_data, top_high_data, 1); + p0 = vmax_u8(p0, p1); + + p1 = vext_u8(top_low_data, top_high_data, 2); + p0 = vmax_u8(p0, p1); + + p1 = mid_low_data; + p0 = vmax_u8(p0, p1); + + p1 = vext_u8(mid_low_data, mid_high_data, 1); + p0 = vmax_u8(p0, p1); + + p1 = vext_u8(mid_low_data, mid_high_data, 2); + p0 = vmax_u8(p0, p1); + + p1 = bot_low_data; + p0 = vmax_u8(p0, p1); + + p1 = vext_u8(bot_low_data, bot_high_data, 1); + p0 = vmax_u8(p0, p1); + + p1 = vext_u8(bot_low_data, bot_high_data, 2); + p0 = vmax_u8(p0, p1); + + vst1_u8(out.ptr(), p0); + }, + in, out); +} diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp new file mode 100644 index 0000000000..effc50e7c0 --- /dev/null +++ 
b/src/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.cpp @@ -0,0 +1,207 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +// Internal load +inline float32x4_t internal_vld1q(const float *in) +{ + return vld1q_f32(in); +} +inline qint8x16_t internal_vld1q(const qint8_t *in) +{ + return vld1q_qs8(in); +} +inline qint16x8_t internal_vld1q(const qint16_t *in) +{ + return vld1q_qs16(in); +} + +// Internal store +inline void internal_vst1q(float *p, const float32x4_t &v) +{ + vst1q_f32(p, v); +} +inline void internal_vst1q(qint8_t *p, const qint8x16_t &v) +{ + vst1q_qs8(p, v); +} +inline void internal_vst1q(qint8_t *p, const qint16x8_t &v) +{ + vst1_qs8(p, vqmovn_s16(v)); +} +inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) +{ + vst1q_qs16(p, v); +} + +// Internal vdup +inline float32x4_t internal_vdupq_n(float v) +{ + return vdupq_n_f32(v); +} +inline qint8x16_t internal_vdupq_n(qint8_t v) +{ + return vdupq_n_qs8(v); +} +inline qint16x8_t internal_vdupq_n(qint16_t v) +{ + return vdupq_n_qs16(v); +} + +// Internal vadd +inline float32x4_t internal_vqaddq(const float32x4_t &x, const float32x4_t &y) +{ + return vaddq_f32(x, y); +} +inline qint8x16_t internal_vqaddq(const qint8x16_t &x, const qint8x16_t &y) +{ + return vqaddq_qs8(x, y); +} +inline qint16x8_t internal_vqaddq(const qint16x8_t &x, const qint16x8_t &y) +{ + return vqaddq_qs16(x, y); +} + +template +void accumulate_bias(ITensor *input, const ITensor *bias, const Window window, ITensor *output) +{ + Iterator in(input, window); + + if(in_place) // In place accumulate + { + execute_window_loop(window, [&](const Coordinates & id) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto vb = internal_vdupq_n(static_cast(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))))); + + // Accumulate bias + internal_vst1q(in_ptr, internal_vqaddq(internal_vld1q(in_ptr), 
vb)); + }, + in); + } + else // Out of place accumulate + { + Iterator out(output, window); + execute_window_loop(window, [&](const Coordinates & id) + { + // Get bias and pointer to input + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + const auto vb = internal_vdupq_n(static_cast(*reinterpret_cast(bias->ptr_to_element(Coordinates(id.z()))))); + + // Accumulate bias + internal_vst1q(out_ptr, internal_vqaddq(internal_vld1q(in_ptr), vb)); + }, + in, out); + } +} +} // namespace + +NEDirectConvolutionLayerBiasAccumulateKernel::NEDirectConvolutionLayerBiasAccumulateKernel() + : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr) +{ +} + +void NEDirectConvolutionLayerBiasAccumulateKernel::configure(ITensor *input, const ITensor *bias, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::QS8, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON(input->info()->fixed_point_position() != bias->info()->fixed_point_position()); + if(output != nullptr) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(bias, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(bias, output); + } + ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1); + + _func = nullptr; + _bias = bias; + _input = input; + _output = output; + + const unsigned int num_elems_processed_per_iteration = 16 / element_size_from_data_type(input->info()->data_type()); + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic bias_access(bias->info(), 0, 0, bias->info()->dimension(0), bias->info()->dimension(1)); + if(output != nullptr) + { + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, input_access, output_access, bias_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + } + else + { + update_window_and_padding(win, input_access, bias_access); + input_access.set_valid_region(win, ValidRegion(Coordinates(), input->info()->tensor_shape())); + } + INEKernel::configure(win); + + // Set appropriate function + if(input->info()->data_type() == DataType::F32) + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + } + else if(input->info()->data_type() == DataType::QS8) + { + _func = (output == nullptr) ? &accumulate_bias : &accumulate_bias; + } + else if(input->info()->data_type() == DataType::QS16 && bias->info()->data_type() == DataType::QS8) + { + _func = (output == nullptr) ? 
&accumulate_bias : &accumulate_bias; + } + else + { + ARM_COMPUTE_ERROR("Unsupported combination of types among the inputs."); + } +} + +void NEDirectConvolutionLayerBiasAccumulateKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _bias, window, _output); +} diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp new file mode 100644 index 0000000000..d6088981aa --- /dev/null +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -0,0 +1,817 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
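For reference, the bias-accumulate kernel above adds exactly one bias value per output feature map: the value for plane id.z() is broadcast with internal_vdupq_n and added across the whole plane, using a plain add for F32 and saturating adds for the QS8/QS16 fixed-point types, in place when no separate output tensor was given. A scalar sketch of the float path, assuming a dense width x height x channels layout; the names are illustrative and not part of this patch:

    #include <cstddef>

    // Adds bias[c] to every element of output channel c.
    void accumulate_bias_scalar(float *data, const float *bias,
                                size_t width, size_t height, size_t channels)
    {
        const size_t plane = width * height;
        for(size_t c = 0; c < channels; ++c)
        {
            for(size_t i = 0; i < plane; ++i)
            {
                data[c * plane + i] += bias[c];
            }
        }
    }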
+ */ +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace +{ +template +float32x4_t internal_vld1q(const float *in); + +template <> +float32x4_t internal_vld1q<1>(const float *in) +{ + return vld1q_f32(in); +} + +template <> +float32x4_t internal_vld1q<2>(const float *in) +{ + const float32x4x2_t tmp = vld2q_f32(in); + return tmp.val[0]; +} + +template <> +float32x4_t internal_vld1q<3>(const float *in) +{ + const float32x4x3_t tmp = vld3q_f32(in); + return tmp.val[0]; +} + +template +qint8x8_t internal_vld1q(const qint8_t *in); + +template <> +qint8x8_t internal_vld1q<1>(const qint8_t *in) +{ + return vld1_qs8(in); +} + +template <> +qint8x8_t internal_vld1q<2>(const qint8_t *in) +{ + const qint8x8x2_t tmp = vld2_s8(in); + return tmp.val[0]; +} + +template <> +qint8x8_t internal_vld1q<3>(const qint8_t *in) +{ + const qint8x8x3_t tmp = vld3_s8(in); + return tmp.val[0]; +} + +template +qint16x8_t internal_vld1q(const qint16_t *in); + +template <> +qint16x8_t internal_vld1q<1>(const qint16_t *in) +{ + return vld1q_s16(in); +} + +inline float32x4_t internal_vdupq_n(float v) +{ + return vdupq_n_f32(v); +} + +inline qint8x8_t internal_vdupq_n(qint8_t v) +{ + return vdup_n_qs8(v); +} + +inline void internal_vst1q(float *p, const float32x4_t &v) +{ + vst1q_f32(p, v); +} + +inline void internal_vst1q(qint16_t *p, const qint16x8_t &v) +{ + vst1q_qs16(p, v); +} + +float32x4_t internal_vmull(const float32x4_t &x, const float32x4_t &y, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + return vmulq_f32(x, y); +} + +qint16x8_t internal_vmull(const qint8x8_t &x, const qint8x8_t &y, int fixed_point_position) +{ + return vmull_qs8(x, y, fixed_point_position); +} + +inline float32x4_t internal_vmlal(const float32x4_t &x, const float32x4_t &y, const float32x4_t &z, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + return vmlaq_f32(x, y, z); +} + +inline qint16x8_t internal_vmlal(const qint16x8_t &x, const qint8x8_t &y, const qint8x8_t &z, int fixed_point_position) +{ + return vqmlal_qs8(x, y, z, fixed_point_position); +} + +template +class convolver_1x1 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + { + const int input_stride_y = input->info()->strides_in_bytes().y(); + const int input_stride_z = input->info()->strides_in_bytes().z(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + const int output_stride_z = output->info()->strides_in_bytes().z(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = output->info()->dimension(0); + const int output_h = output->info()->dimension(1); + const int range_z = window.z().end() - window.z().start(); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const int fixed_point_position = 
input->info()->fixed_point_position(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), range_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator out(output, window_out); + Iterator in(input, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + /* + For a detailed explanation on how the algorithm works refer to template <> class convolver_3x3<1> + */ + const uint8_t *input_ptr = in.ptr(); + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + for(int oz = 0; oz < range_z; ++oz) + { + auto p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto k_val = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast(input_ptr + (0 * input_stride_z + offset_xy)); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) + { + internal_vst1q(p_out, internal_vmull(vk, internal_vld1q(in_val), fixed_point_position)); + } + } + } + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const auto k_val = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w); + const auto vk = internal_vdupq_n(*k_val); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + const int offset_xy = ih * input_stride_y; + auto in_val = reinterpret_cast(input_ptr + p * input_stride_z + offset_xy); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, in_val += num_elems_read_per_iteration, p_out += num_elems_written_per_iteration) + { + internal_vst1q(p_out, internal_vmlal(internal_vld1q<1>(p_out), vk, internal_vld1q(in_val), fixed_point_position)); + } + } + } + } + }, + in, out); + } +}; + +inline float32x4x3_t load_matrix_row(const float *ptr) +{ + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} +inline qint8x8x3_t load_matrix_row(const qint8_t *ptr) +{ + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + const qint8x8x3_t r = + { + { + vld1_dup_qs8(ptr), + vld1_dup_qs8(1 + ptr), + vld1_dup_qs8(2 + ptr) + } + }; + return r; +} + +template +float32x4x2_t 
convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position); + +template <> +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + float32x4x2_t out = + { + { + vmulq_f32(vtop.val[0], m0.val[0]), + vmulq_f32(vtop.val[1], m0.val[0]) + } + }; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, int fixed_point_position) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +template +qint16x8x2_t convolve_3x3(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position); + +template <> +inline qint16x8x2_t convolve_3x3<1>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const 
qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +{ + ARM_COMPUTE_UNUSED(fixed_point_position); + + const qint8x8x3_t vtop = + { + { + vld1_qs8(in_top), + vld1_qs8(in_top + 8), + vld1_qs8(in_top + 16) + } + }; + const qint8x8x3_t vmid = + { + { + vld1_qs8(in_mid), + vld1_qs8(in_mid + 8), + vld1_qs8(in_mid + 16) + } + }; + const qint8x8x3_t vlow = + { + { + vld1_qs8(in_low), + vld1_qs8(in_low + 8), + vld1_qs8(in_low + 16) + } + }; + qint16x8x2_t out = + { + { + vmull_qs8(vtop.val[0], m0.val[0], fixed_point_position), + vmull_qs8(vtop.val[1], m0.val[0], fixed_point_position) + } + }; + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 1), m0.val[1], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vtop.val[0], vtop.val[1], 2), m0.val[2], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vmid.val[0], m1.val[0], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 1), m1.val[1], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vmid.val[0], vmid.val[1], 2), m1.val[2], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vlow.val[0], m2.val[0], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 1), m2.val[1], fixed_point_position); + out.val[0] = vqmlal_qs8(out.val[0], vext_s8(vlow.val[0], vlow.val[1], 2), m2.val[2], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 1), m0.val[1], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vtop.val[1], vtop.val[2], 2), m0.val[2], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vmid.val[1], m1.val[0], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 1), m1.val[1], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vmid.val[1], vmid.val[2], 2), m1.val[2], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vlow.val[1], m2.val[0], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 1), m2.val[1], fixed_point_position); + out.val[1] = vqmlal_qs8(out.val[1], vext_s8(vlow.val[1], vlow.val[2], 2), m2.val[2], fixed_point_position); + return out; +} + +template <> +inline qint16x8x2_t convolve_3x3<2>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +{ + qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 4), out.val[0], 2); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 3); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 0), out.val[0], 4); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 2), out.val[0], 5); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 4), out.val[0], 6); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 6), out.val[0], 7); + return out; +} + +template <> +inline qint16x8x2_t convolve_3x3<3>(const qint8_t *in_top, const qint8_t *in_mid, const qint8_t *in_low, const qint8x8x3_t &m0, const qint8x8x3_t &m1, const qint8x8x3_t &m2, int fixed_point_position) +{ + qint16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2, fixed_point_position); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 3), 
out.val[0], 1); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[0], 6), out.val[0], 2); + out.val[0] = vsetq_lane_s16(vgetq_lane_s16(out.val[1], 1), out.val[0], 3); + return out; +} + +template +void store_results(float *buffer, const float32x4x2_t &values); + +template <> +void store_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +template +void store_results(qint16_t *buffer, const qint16x8x2_t &values); + +template <> +void store_results<1>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1q_qs16(buffer, values.val[0]); + vst1q_qs16(buffer + 8, values.val[1]); +} + +template <> +void store_results<2>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1q_qs16(buffer, values.val[0]); +} + +template <> +void store_results<3>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1_qs16(buffer, vget_low_s16(values.val[0])); +} + +template +void accumulate_results(float *buffer, const float32x4x2_t &values); + +template <> +void accumulate_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); + vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1])); +} + +template <> +void accumulate_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); +} + +template <> +void accumulate_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); +} + +template +void accumulate_results(qint16_t *buffer, const qint16x8x2_t &values); + +template <> +void accumulate_results<1>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0])); + vst1q_qs16(buffer + 8, vqaddq_qs16(vld1q_qs16(buffer + 8), values.val[1])); +} + +template <> +void accumulate_results<2>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1q_qs16(buffer, vqaddq_qs16(vld1q_qs16(buffer), values.val[0])); +} + +template <> +void accumulate_results<3>(qint16_t *buffer, const qint16x8x2_t &values) +{ + vst1_qs16(buffer, vqadd_qs16(vld1_qs16(buffer), vget_low_s16(values.val[0]))); +} + +template +int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration); + +template <> +int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration; +} + +template <> +int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration << 1; +} + +template <> +int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration * 3; +} + +template +class convolver_3x3 +{ +public: + static void convolve(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) + { + ARM_COMPUTE_UNUSED(num_elems_read_per_iteration); + const int input_stride_x = input->info()->strides_in_bytes().x(); + const int input_stride_y = input->info()->strides_in_bytes().y(); + const int input_stride_z 
= input->info()->strides_in_bytes().z(); + const int output_stride_y = output->info()->strides_in_bytes().y(); + const int output_stride_z = output->info()->strides_in_bytes().z(); + const int kernel_stride_x = weights->info()->strides_in_bytes().x(); + const int kernel_stride_y = weights->info()->strides_in_bytes().y(); + const int kernel_stride_z = weights->info()->strides_in_bytes().z(); + const int kernel_stride_w = weights->info()->strides_in_bytes()[3]; + const int output_w = output->info()->dimension(0); + const int output_h = output->info()->dimension(1); + const int num_planes_z = window.z().end() - window.z().start(); + const int delta_input = get_input_num_elems_processed(num_elems_written_per_iteration); + const int kernel_depth = weights->info()->dimension(Window::DimZ); + const unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + const int fixed_point_position = input->info()->fixed_point_position(); + + // setup output window for the iterator + Window window_out = window; + window_out.set(Window::DimX, Window::Dimension(0, output->info()->dimension(Window::DimX), output->info()->dimension(Window::DimX))); + window_out.set(Window::DimY, Window::Dimension(0, output->info()->dimension(Window::DimY), output->info()->dimension(Window::DimY))); + window_out.set(Window::DimZ, Window::Dimension(window.z().start(), window.z().end(), num_planes_z)); + + // setup input window for the iterator + Window window_in = window; + // we just want execute_window_loop to iterate over the higher dimensions (>3), so we set the first 3 dimensions to 0 + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window window_k = calculate_max_window(*weights->info(), Steps(1u)); + + Iterator out(output, window_out); + Iterator in(input, window_in); + Iterator k(weights, window_k); + + const uint8_t *k_ptr = k.ptr(); + + execute_window_loop(window_out, [&](const Coordinates & id) + { + const uint8_t *input_ptr = in.ptr() - conv_pad_x * input_stride_x - conv_pad_y * input_stride_y; + uint8_t *out_ptr = out.ptr(); + int ih = 0; + int oh = 0; + /* + Each thread executing this kernel computes one or more output's volume planes. + + Let's say the 3rd dimension of the output volume is 32, the first thread will compute the output for Z = [0,7], the second thread will compute the output for Z = [8,15], + the third thread [16,24] and the fourth thread [25,31]. + + The algorithm outer loop iterates over Z, P, Y, X where P is the depth/3rd dimension of each kernel. This order is not arbitrary, the main benefit of this + is that we setup the neon registers containing the kernerl's values only once and then compute each XY using the preloaded registers as opposed as doing this for every XY value. + + The algorithm does not require allocating any additional memory amd computes the results directly in-place in two stages: + 1) Convolve plane 0 with kernel 0 and initialize the corresponding output plane with these values. + 2) Convolve the remaining planes and accumulate the results in the output's plane which has been initialized in step 1. 
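+
+            In loop form the two stages look roughly like this (schematic pseudocode only; the real code below
+            walks raw byte pointers using the strides computed above):
+
+                for oz in [0, num_planes_z):                         // output planes assigned to this thread
+                    // Step 1: kernel plane 0 initialises the output plane
+                    for each output row oh and column ow:
+                        out[oz][oh][ow] = convolve_3x3(input plane 0, kernel (id.z + oz) plane 0)
+                    // Step 2: the remaining kernel planes accumulate into the same output plane
+                    for p in [1, kernel_depth):
+                        for each output row oh and column ow:
+                            out[oz][oh][ow] += convolve_3x3(input plane p, kernel (id.z + oz) plane p)
+
+            Step 1 writes with store_results() while step 2 accumulates with accumulate_results().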
+ */ + + for(int oz = 0; oz < num_planes_z; ++oz) + { + uint8_t *p_out_base = out_ptr + oz * output_stride_z; + // Step 1 + { + const auto ptr_k_r0 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast(k_ptr + 0 * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto vk_r0 = load_matrix_row(ptr_k_r0); + const auto vk_r1 = load_matrix_row(ptr_k_r1); + const auto vk_r2 = load_matrix_row(ptr_k_r2); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast(input_ptr + 0 * input_stride_z + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position); + store_results(p_out, vres); + } + } + } + // Step 2 + for(int p = 1; p < kernel_depth; ++p) + { + const auto ptr_k_r0 = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 0 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r1 = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 1 * kernel_stride_y + 0 * kernel_stride_x); + const auto ptr_k_r2 = reinterpret_cast(k_ptr + p * kernel_stride_z + (id.z() + oz) * kernel_stride_w + 2 * kernel_stride_y + 0 * kernel_stride_x); + const auto vk_r0 = load_matrix_row(ptr_k_r0); + const auto vk_r1 = load_matrix_row(ptr_k_r1); + const auto vk_r2 = load_matrix_row(ptr_k_r2); + for(ih = 0, oh = 0; oh < output_h; ++oh, ih += conv_stride_y) + { + auto in_top = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 0) * input_stride_y); + auto in_mid = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 1) * input_stride_y); + auto in_low = reinterpret_cast(input_ptr + p * input_stride_z + (ih + 2) * input_stride_y); + auto p_out = reinterpret_cast(p_out_base + oh * output_stride_y); + for(int ow = 0; ow < output_w; ow += num_elems_written_per_iteration, + in_top += delta_input, in_mid += delta_input, in_low += delta_input, p_out += num_elems_written_per_iteration) + { + auto vres = convolve_3x3(in_top, in_mid, in_low, vk_r0, vk_r1, vk_r2, fixed_point_position); + accumulate_results(p_out, vres); + } + } + } + } + }, + in, out); + } +}; + +template +inline void convolve_1x1(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + break; + case 2: + convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + break; + case 3: + 
convolver_1x1::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} + +template +inline void convolve_3x3(const Window &window, unsigned int num_elems_read_per_iteration, unsigned int num_elems_written_per_iteration, + const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +{ + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + switch(conv_stride_x) + { + case 1: + convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + break; + case 2: + convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + break; + case 3: + convolver_3x3::convolve(window, num_elems_read_per_iteration, num_elems_written_per_iteration, input, weights, output, conv_info); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace + +NEDirectConvolutionLayerKernel::NEDirectConvolutionLayerKernel() + : _input(nullptr), _weights(nullptr), _output(nullptr), _conv_info(), _border_size(0), _kernel_size(0), _num_elems_read_per_iteration(0), _num_elems_written_per_iteration(0) +{ +} + +BorderSize NEDirectConvolutionLayerKernel::border_size() const +{ + return _border_size; +} + +void NEDirectConvolutionLayerKernel::configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 1 && (std::get<0>(conv_info.pad()) || std::get<1>(conv_info.pad())), + "Pad > 0 not supported for 1x1 weights"); + ARM_COMPUTE_ERROR_ON_MSG(weights->info()->dimension(0) == 3 && (std::get<0>(conv_info.pad()) > 1 || std::get<1>(conv_info.pad()) > 1), + "Pad > 1 not supported for 3x3 weights"); + ARM_COMPUTE_ERROR_ON_MSG(std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported."); + + const unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + const unsigned int conv_pad_x = std::get<0>(conv_info.pad()); + const unsigned int conv_pad_y = std::get<1>(conv_info.pad()); + + _input = input; + _weights = weights; + _output = output; + _conv_info = conv_info; + _kernel_size = weights->info()->dimension(0); + _border_size = BorderSize(conv_pad_y, conv_pad_x); + + Window win = calculate_max_window(*output->info()); + + switch(_kernel_size) + { + case 1: + { + _num_elems_written_per_iteration = (input->info()->data_type() == DataType::QS8) ? 
8 : 4; + _num_elems_read_per_iteration = conv_stride_x * _num_elems_written_per_iteration; + + win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, _num_elems_read_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + break; + } + case 3: + { + if(input->info()->data_type() == DataType::F32) + { + _num_elems_read_per_iteration = 12; + _num_elems_written_per_iteration = 16 >> conv_stride_x; + } + else + { + _num_elems_read_per_iteration = 24; + _num_elems_written_per_iteration = 32 >> conv_stride_x; + } + + // Calculate right and bottom border + const unsigned int conv_stride_y = std::get<1>(_conv_info.stride()); + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ceil_to_multiple(((output->info()->dimension(0) - 1) * conv_stride_x + _kernel_size), _num_elems_read_per_iteration) - conv_pad_x - input_width; + const int upper_bound_h = ((output->info()->dimension(1) - 1) * conv_stride_y - conv_pad_y + _kernel_size) - input_height; + _border_size.right = std::max(upper_bound_w, static_cast(_kernel_size)); + _border_size.bottom = std::max(upper_bound_h, static_cast(_kernel_size)); + + // Create window and update padding + win = calculate_max_window(*output->info(), Steps(_num_elems_written_per_iteration)); + AccessWindowStatic input_access(input->info(), -conv_pad_x, -conv_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); + AccessWindowStatic weights_access(weights->info(), 0, 0, _kernel_size, _kernel_size); + AccessWindowHorizontal output_access(output->info(), 0, _num_elems_written_per_iteration); + update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + break; + } + default: + { + ARM_COMPUTE_ERROR("Not implemented"); + break; + } + } + + INEKernel::configure(win); +} + +void NEDirectConvolutionLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + const int kernel_size = _weights->info()->dimension(0); + + switch(kernel_size) + { + case 1: + { + if(_input->info()->data_type() == DataType::QS8) + { + convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + } + else + { + convolve_1x1(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + } + break; + } + case 3: + { + if(_input->info()->data_type() == DataType::QS8) + { + convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + } + else + { + convolve_3x3(window, _num_elems_read_per_iteration, _num_elems_written_per_iteration, _input, _weights, _output, _conv_info); + } + break; + } + default: + { + ARM_COMPUTE_ERROR("Only kernel sizes 1x1 and 3x3 are supported."); + break; + } + } +} diff --git a/src/core/NEON/kernels/NEErodeKernel.cpp b/src/core/NEON/kernels/NEErodeKernel.cpp new file mode 100644 index 0000000000..398503627c --- /dev/null +++ 
b/src/core/NEON/kernels/NEErodeKernel.cpp @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEErodeKernel.h" + +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +BorderSize NEErodeKernel::border_size() const +{ + return BorderSize(1); +} + +void NEErodeKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEErodeKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator in(_input, window); + Iterator out(_output, window); + + const size_t in_stride = _input->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates &) + { + uint8_t *in_ptr = in.ptr() - 1; + const uint8x16_t top_data = vld1q_u8(in_ptr - in_stride); + const uint8x16_t mid_data = vld1q_u8(in_ptr); + const uint8x16_t bot_data = vld1q_u8(in_ptr + in_stride); + + uint8x8_t top_high_data = vget_high_u8(top_data); + uint8x8_t top_low_data = vget_low_u8(top_data); + + uint8x8_t mid_high_data = vget_high_u8(mid_data); + uint8x8_t mid_low_data = vget_low_u8(mid_data); + + uint8x8_t bot_high_data = vget_high_u8(bot_data); 
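+        // The vext_u8 calls below build, for each of the three rows, copies shifted left by one and by two
+        // pixels; taking vmin_u8 over the unshifted and shifted copies of all three rows yields the 3x3
+        // erosion of the 8 output pixels (NEDilateKernel above uses the same pattern with vmax_u8).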
+ uint8x8_t bot_low_data = vget_low_u8(bot_data); + + uint8x8_t p0, p1; + + p0 = top_low_data; + p1 = vext_u8(top_low_data, top_high_data, 1); + p0 = vmin_u8(p0, p1); + + p1 = vext_u8(top_low_data, top_high_data, 2); + p0 = vmin_u8(p0, p1); + + p1 = mid_low_data; + p0 = vmin_u8(p0, p1); + + p1 = vext_u8(mid_low_data, mid_high_data, 1); + p0 = vmin_u8(p0, p1); + + p1 = vext_u8(mid_low_data, mid_high_data, 2); + p0 = vmin_u8(p0, p1); + + p1 = bot_low_data; + p0 = vmin_u8(p0, p1); + + p1 = vext_u8(bot_low_data, bot_high_data, 1); + p0 = vmin_u8(p0, p1); + + p1 = vext_u8(bot_low_data, bot_high_data, 2); + p0 = vmin_u8(p0, p1); + + vst1_u8(out.ptr(), p0); + }, + in, out); +} diff --git a/src/core/NEON/kernels/NEFastCornersKernel.cpp b/src/core/NEON/kernels/NEFastCornersKernel.cpp new file mode 100644 index 0000000000..9e8b5526a1 --- /dev/null +++ b/src/core/NEON/kernels/NEFastCornersKernel.cpp @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
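Taken together, NEDilateKernel and NEErodeKernel above are the standard 3x3 grayscale morphology operators: dilation keeps the maximum and erosion the minimum of each pixel's 3x3 neighbourhood. A scalar reference sketch, assuming a uint8_t image with row stride `stride` in elements and ignoring the border handling the kernels delegate to border_size(); names are illustrative only:

    #include <algorithm>
    #include <cstdint>

    // 3x3 erosion: out(x, y) is the minimum of in over the 3x3 window centred on (x, y).
    // Replacing std::min (and the 255 seed) with std::max (and a 0 seed) gives the matching dilation.
    void erode3x3_scalar(const uint8_t *in, uint8_t *out, int width, int height, int stride)
    {
        for(int y = 1; y < height - 1; ++y)
        {
            for(int x = 1; x < width - 1; ++x)
            {
                uint8_t m = 255;
                for(int dy = -1; dy <= 1; ++dy)
                {
                    for(int dx = -1; dx <= 1; ++dx)
                    {
                        m = std::min(m, in[(y + dy) * stride + (x + dx)]);
                    }
                }
                out[y * stride + x] = m;
            }
        }
    }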
+ */ +#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +NEFastCornersKernel::NEFastCornersKernel() + : INEKernel(), _input(nullptr), _output(nullptr), _threshold(0), _non_max_suppression(false) +{ +} + +namespace +{ +constexpr size_t PERMUTATIONS = 16; +constexpr size_t PERM_SIZE = 16; + +inline uint8x8x2_t create_permutation_index(size_t k) +{ + ARM_COMPUTE_ERROR_ON(k >= PERMUTATIONS); + + static const uint8_t permutations_table[PERMUTATIONS][PERM_SIZE] + { + { 0, 1, 2, 3, 4, 5, 6, 7, 8, 255, 255, 255, 255, 255, 255, 255 }, + { 15, 0, 1, 2, 3, 4, 5, 6, 7, 255, 255, 255, 255, 255, 255, 255 }, + { 14, 15, 0, 1, 2, 3, 4, 5, 6, 255, 255, 255, 255, 255, 255, 255 }, + { 13, 14, 15, 0, 1, 2, 3, 4, 5, 255, 255, 255, 255, 255, 255, 255 }, + { 12, 13, 14, 15, 0, 1, 2, 3, 4, 255, 255, 255, 255, 255, 255, 255 }, + { 11, 12, 13, 14, 15, 0, 1, 2, 3, 255, 255, 255, 255, 255, 255, 255 }, + { 10, 11, 12, 13, 14, 15, 0, 1, 2, 255, 255, 255, 255, 255, 255, 255 }, + { 9, 10, 11, 12, 13, 14, 15, 0, 1, 255, 255, 255, 255, 255, 255, 255 }, + { 8, 9, 10, 11, 12, 13, 14, 15, 0, 255, 255, 255, 255, 255, 255, 255 }, + { 7, 8, 9, 10, 11, 12, 13, 14, 15, 255, 255, 255, 255, 255, 255, 255 }, + { 6, 7, 8, 9, 10, 11, 12, 13, 14, 255, 255, 255, 255, 255, 255, 255 }, + { 5, 6, 7, 8, 9, 10, 11, 12, 13, 255, 255, 255, 255, 255, 255, 255 }, + { 4, 5, 6, 7, 8, 9, 10, 11, 12, 255, 255, 255, 255, 255, 255, 255 }, + { 3, 4, 5, 6, 7, 8, 9, 10, 11, 255, 255, 255, 255, 255, 255, 255 }, + { 2, 3, 4, 5, 6, 7, 8, 9, 10, 255, 255, 255, 255, 255, 255, 255 }, + { 1, 2, 3, 4, 5, 6, 7, 8, 9, 255, 255, 255, 255, 255, 255, 255 } + + }; + + const uint8x8x2_t index = + { + { + vld1_u8(permutations_table[k]), + vld1_u8(permutations_table[k] + 8) + } + }; + + return index; +} + +inline uint8x8x4_t create_circle_index_register() +{ + /* + This function creates the index registers to retrieve the 16 texels in the Bresenham circle of radius 3 with center in P. + + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . + + Where . is an irrelevant texel value + + We want to retrieve all texels [0,F] + + The 4 registers in r will then be used to get these texels out of two tables in the function get_circle_texels() + + The first table holds the top 4 rows of texels + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + + The second table the bottom 3 rows of texels + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . + + */ + static const uint8_t top_right[8] = + { + /* The register r.val[0] will be used to retrieve these texels: + . . . 0 1 . . . + . . . . . 2 . . + . . . . . . 3 . + . . . . . . 4 . + */ + 3 /* top table, first row, elem 4, value 0 in the diagram above */, + 4 /* top table, first row, elem 5, value 1 in the diagram above */, + 13 /* top table, second row, elem 6, value 2 in the diagram above */, + 22 /* top table, third row, elem 7, value 3 in the diagram above*/, + 30 /* top table, fourth row, elem 7, value 4 in the diagram above*/, + 255, + 255, + 255 + }; + + static const uint8_t bottom_right[8] = + { + /* The register r.val[1] will be used to retrieve these texels: + . . . . . . 5 . + . . . . . 6 . . + . . . . 7 . . . 
+ */ + 255, + 255, + 255, + 255, + 255, + 6 /* low table, first row, elem 7, value 5 in the diagram above*/, + 13 /* low table, second row, elem 6, value 6 in the diagram above*/, + 20 /* low table, third row, elem 5, value 7 in the diagram above*/ + }; + + static const uint8_t top_left[8] = + { + /* The register r.val[2] will be used to retrieve these texels: + . . F . . . . . + . E . . . . . . + D . . . . . . . + C . . . . . . . + */ + 255, + 255, + 255, + 255, + 24 /* top table, fourth row, elem 1, value C in the diagram above */, + 16 /* top table, third row, elem 1, value D in the diagram above*/, + 9 /* top table, second row, elem 2, value E in the diagram above*/, + 2 /* top table, first row, elem 3, value F in the diagram above*/ + }; + + static const uint8_t bottom_left[8] = + { + /* The register r.val[3] will be used to retrieve these texels: + B . . . . . . . + . A . . . . . . + . . 9 8 . . . . + */ + 19 /* low table, third row, elem 4, value 8 in the diagram above */, + 18 /* low table, third row, elem 3, value 9 in the diagram above */, + 9 /* low table, second row, elem 2, value A in the diagram above */, + 0 /* low table, first row, elem 1, value B in the diagram above */, + 255, + 255, + 255, + 255 + }; + + const uint8x8x4_t reg = + { + { + vld1_u8(top_right), + vld1_u8(bottom_right), + vld1_u8(top_left), + vld1_u8(bottom_left) + } + }; + + return reg; +} + +inline uint8x16_t get_circle_texels(const uint8x8x4_t &index, const uint8x8x4_t &tbl_hi, const uint8x8x3_t &tbl_lo) +{ + /* + This function loads the 16 texels in the Bresenham circle of radius 3 into the register 'texels'. + The parameter 'index' is an array of indices which was previously setup in setup_circle_index_register(). + tbl_hi and tbl_lo are the two tables holding the texels in the window [(-3,-3),(+3,+3)] for a given texel P + */ + return vcombine_u8(vtbx3_u8(vtbl4_u8(tbl_hi, index.val[0]), tbl_lo, index.val[1]), + vtbx3_u8(vtbl4_u8(tbl_hi, index.val[2]), tbl_lo, index.val[3])); +} + +inline uint8x16_t get_permutation_texels(const uint8x8x2_t &permutation_index, const uint8x8x2_t &tbl_circle) +{ + /* + This function stores the 9 texels of a give permutation X in the neon register 'texels' + + 'tbl_circle' is a LUT with the texels 0 to F + + . . F 0 1 . . . + . E . . . 2 . . + D . . . . . 3 . + C . . P . . 4 . + B . . . . . 5 . + . A . . . 6 . . + . . 9 8 7 . . . 
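+
+            Each permutation listed below is one contiguous arc of nine texels on the 16-texel circle, starting
+            at a different position (i.e. the FAST-9 form of the segment test); the 255 entries in the index
+            tables are only padding for the lanes the lookup does not use.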
+ + 'permutation_index' is one of the permutations below: + + { 0, 1, 2, 3, 4, 5, 6, 7, 8}, + { F, 0, 1, 2, 3, 4, 5, 6, 7}, + { E, F, 0, 1, 2, 3, 4, 5, 6}, + { D, E, F, 0, 1, 2, 3, 4, 5}, + { C, D, E, F, 0, 1, 2, 3, 4}, + { B, C, D, E, F, 0, 1, 2, 3}, + { A, B, C, D, E, F, 0, 1, 2}, + { 9, A, B, C, D, E, F, 0, 1}, + { 8, 9, A, B, C, D, E, F, 0}, + { 7, 8, 9, A, B, C, D, E, F}, + { 6, 7, 8, 9, A, B, C, D, E}, + { 5, 6, 7, 8, 9, A, B, C, D}, + { 4, 5, 6, 7, 8, 9, A, B, C}, + { 3, 4, 5, 6, 7, 8, 9, A, B}, + { 2, 3, 4, 5, 6, 7, 8, 9, A}, + { 1, 2, 3, 4, 5, 6, 7, 8, 9}, + */ + static const uint8x8_t perm_right = vdup_n_u8(255); // init to 255 so that vtbx preserves the original values of the lanes + + return vcombine_u8(vtbl2_u8(tbl_circle, permutation_index.val[0]), + vtbx2_u8(perm_right, tbl_circle, permutation_index.val[1])); +} + +inline bool is_permutation_brighter(const uint8x16_t &permutation, const uint8x16_t &pg) +{ + const uint8x16_t res_gt = vcgtq_u8(permutation, pg); + + return vget_lane_u64(vreinterpret_u64_u8(vand_u8(vget_high_u8(res_gt), vget_low_u8(res_gt))), 0) == std::numeric_limits::max(); +} + +inline bool is_permutation_darker(const uint8x16_t &permutation, const uint8x16_t &pl) +{ + const uint8x16_t res_lt = vcltq_u8(permutation, pl); + const uint64x2_t u64res_lt = vreinterpretq_u64_u8(res_lt); + const uint64_t t3 = vgetq_lane_u64(u64res_lt, 0); + const uint64_t t4 = vgetq_lane_u64(u64res_lt, 1); + + return std::numeric_limits::max() == t3 && 255 == t4; +} + +inline bool is_permutation_corner(const uint8x16_t &permutation, const uint8x16_t &pg, const uint8x16_t &pl) +{ + return is_permutation_brighter(permutation, pg) || is_permutation_darker(permutation, pl); +} + +inline bool point_is_fast_corner(uint8_t p, uint8_t threshold, const uint8x8x2_t &tbl_circle_texels, uint8x8x2_t perm_indices[PERMUTATIONS]) +{ + /* + This function determines whether the point 'p' is a corner. + */ + uint8x16_t pg = vqaddq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold)); + uint8x16_t pl = vqsubq_u8(vdupq_n_u8(p), vdupq_n_u8(threshold)); + + bool corner_detected = false; + + for(size_t j = 0; !corner_detected && j < PERMUTATIONS; ++j) + { + const uint8x16_t pe_texels = get_permutation_texels(perm_indices[j], tbl_circle_texels); + corner_detected = is_permutation_corner(pe_texels, pg, pl); + } + + return corner_detected; +} + +inline uint8x8x2_t create_circle_tbl(const uint8_t *const __restrict buffer[7], size_t in_offset, const uint8x8x4_t &circle_index_r) +{ + /* + This function builds a LUT holding the 16 texels in the Brensenham circle radius 3. + circle_index_r is a vector of 4 registers to retrieve the texels from the two tables mentioned above. + */ + + //Load the texels in the window [(x-3,y-3),(x+3,y+3)]. + //The top 4 rows are loaded in tbl_hi and the low 3 rows in tbl_lo. + //These two tables are then used to retrieve the texels in the Bresenham circle of radius 3. 
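+    //For each half of the result, get_circle_texels() first gathers from the top four rows with vtbl4_u8
+    //(lanes whose index is out of range come back as 0) and then overlays the bottom three rows with vtbx3_u8,
+    //which leaves lanes with out-of-range indices untouched, so the final uint8x16_t holds the 16 circle
+    //texels in the order 0..F expected by the permutation tables.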
+ const uint8x8x4_t tbl_window_hi = + { + { + vld1_u8(buffer[0] + in_offset), + vld1_u8(buffer[1] + in_offset), + vld1_u8(buffer[2] + in_offset), + vld1_u8(buffer[3] + in_offset) + } + }; + + const uint8x8x3_t tbl_window_lo = + { + { + vld1_u8(buffer[4] + in_offset), + vld1_u8(buffer[5] + in_offset), + vld1_u8(buffer[6] + in_offset) + } + }; + + const uint8x16_t circle_texels = get_circle_texels(circle_index_r, tbl_window_hi, tbl_window_lo); + + const uint8x8x2_t tbl_circle_texels = + { + { + vget_low_u8(circle_texels), + vget_high_u8(circle_texels) + } + }; + + return tbl_circle_texels; +} + +inline uint8_t get_point_score(uint8_t p, uint8_t tolerance, const uint8x8x2_t &tbl_circle, uint8x8x2_t perm_indices[PERMUTATIONS]) +{ + uint8_t b = 255; + uint8_t a = tolerance; + + while(b - a > 1) + { + const uint16_t ab = a + b; + const uint8_t c = ab >> 1; + + if(point_is_fast_corner(p, c, tbl_circle, perm_indices)) + { + a = c; + } + else + { + b = c; + } + } + + return a; +} +} // namespace + +BorderSize NEFastCornersKernel::border_size() const +{ + return BorderSize(3); +} + +void NEFastCornersKernel::configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MSG(border_undefined == false, "Not implemented"); + + _input = input; + _output = output; + _threshold = threshold; + _non_max_suppression = non_max_suppression; + + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 1; + constexpr unsigned int num_rows_read_per_iteration = 7; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + AccessWindowRectangle input_access(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEFastCornersKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + std::array perm_index{ {} }; + /* + We use a LUT loaded with 7 rows of uint8_t from the input image [-3,-3]...[+3,+3] to retrieve the texels in the Brensenham circle radius 3 and put them in one neon register uint8x16_t. + The three lines below setup the neon index registers to get these texels out from the table + */ + const uint8x8x4_t circle_index_r = create_circle_index_register(); + /* + We put the 16 texels (circle) in a LUT to easily generate all the permutations. The for block below setups the indices for each permutation. 
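+
+        For instance, permutation k simply selects the 9 consecutive circle texels
+        starting at index (16 - k) % 16 (see the table in get_permutation_texels()),
+        so a scalar equivalent of create_permutation_index(k) could look like the
+        sketch below, where 255 marks the unused lanes that the vtbx lookup later
+        leaves untouched (illustration only):
+
+            uint8_t idx[16];
+            for(size_t i = 0; i < 16; ++i)
+            {
+                idx[i] = (i < 9) ? (16 - k + i) % 16 : 255;
+            }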
+ */ + for(size_t k = 0; k < PERMUTATIONS; ++k) + { + perm_index[k] = create_permutation_index(k); + } + + Iterator in(_input, window); + Iterator out(_output, window); + + const uint8_t *const __restrict in_row[7] = + { + _input->ptr_to_element(Coordinates(-3, -3)), + _input->ptr_to_element(Coordinates(-3, -2)), + _input->ptr_to_element(Coordinates(-3, -1)), + _input->ptr_to_element(Coordinates(-3, 0)), + _input->ptr_to_element(Coordinates(-3, 1)), + _input->ptr_to_element(Coordinates(-3, 2)), + _input->ptr_to_element(Coordinates(-3, 3)) + }; + + auto is_rejected = [](uint8_t p, uint8_t q, uint8_t a, uint8_t b) + { + const bool p_is_in_ab = (a <= p) && (p <= b); + const bool q_is_in_ab = (a <= q) && (q <= b); + return p_is_in_ab && q_is_in_ab; + }; + + execute_window_loop(window, [&](const Coordinates & id) + { + const size_t in_offset = in.offset(); + const uint8_t p0 = *in.ptr(); + const uint8_t b = std::min(p0 + _threshold, 255); + const uint8_t a = std::max(p0 - _threshold, 0); + uint8_t score = 0; + /* + Fast check to discard points which cannot be corners and avoid the expensive computation of the potential 16 permutations + + pixels 1 and 9 are examined, if both I1 and I9 are within [Ip - t, Ip + t], then candidate p is not a corner. + */ + const uint8_t p1 = (in_offset + in_row[0])[3]; + const uint8_t p9 = (in_offset + in_row[6])[3]; + + if(!is_rejected(p1, p9, a, b)) + { + /* pixels 5 and 13 are further examined to check whether three of them are brighter than Ip + t or darker than Ip - t */ + const uint8_t p5 = (in_offset + in_row[3])[6]; + const uint8_t p13 = (in_offset + in_row[3])[0]; + + if(!is_rejected(p5, p13, a, b)) + { + /* at this stage we use the full test with the 16 permutations to classify the point as corner or not */ + const uint8x8x2_t tbl_circle_texel = create_circle_tbl(in_row, in_offset, circle_index_r); + + if(point_is_fast_corner(p0, _threshold, tbl_circle_texel, perm_index.data())) + { + if(_non_max_suppression) + { + score = get_point_score(p0, _threshold, tbl_circle_texel, perm_index.data()); + } + else + { + score = 1; + } + } + } + } + + *out.ptr() = score; + }, + in, out); +} diff --git a/src/core/NEON/kernels/NEFillArrayKernel.cpp b/src/core/NEON/kernels/NEFillArrayKernel.cpp new file mode 100644 index 0000000000..7e7e1c2501 --- /dev/null +++ b/src/core/NEON/kernels/NEFillArrayKernel.cpp @@ -0,0 +1,91 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" + +using namespace arm_compute; + +NEFillArrayKernel::NEFillArrayKernel() + : _input(nullptr), _output(nullptr), _threshold(0) +{ +} + +void NEFillArrayKernel::configure(const IImage *input, uint8_t threshold, IKeyPointArray *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + _input = input; + _output = output; + _threshold = threshold; + + constexpr unsigned int num_elems_processed_per_iteration = 1; + constexpr unsigned int num_elems_read_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_read_per_iteration)); + + INEKernel::configure(win); +} + +bool NEFillArrayKernel::is_parallelisable() const +{ + return false; +} + +void NEFillArrayKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Iterator input(_input, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8_t value = *input.ptr(); + + if(value >= _threshold) + { + KeyPoint p; + p.x = id.x(); + p.y = id.y(); + p.strength = value; + p.tracking_status = 1; + + if(!_output->push_back(p)) + { + return; //Overflowed: stop trying to add more points + } + } + }, + input); +} diff --git a/src/core/NEON/kernels/NEFillBorderKernel.cpp b/src/core/NEON/kernels/NEFillBorderKernel.cpp new file mode 100644 index 0000000000..bd99242b11 --- /dev/null +++ b/src/core/NEON/kernels/NEFillBorderKernel.cpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEFillBorderKernel::NEFillBorderKernel() + : _tensor(nullptr), _border_size(0), _mode(BorderMode::UNDEFINED), _constant_border_value(0) +{ +} + +void NEFillBorderKernel::configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(tensor, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F32); + + _tensor = tensor; + _border_size = border_size; + _mode = border_mode; + _constant_border_value = constant_border_value; + + _border_size.limit(tensor->info()->padding()); + + Window win; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + win.use_tensor_dimensions(_tensor->info(), Window::DimZ); + INEKernel::configure(win); +} + +void NEFillBorderKernel::run(const Window &window) +{ + // If there is no border: early exit + if(_border_size.empty()) + { + return; + } + + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + switch(_mode) + { + case BorderMode::CONSTANT: + { + switch(_tensor->info()->data_type()) + { + case DataType::U8: + fill_constant_value_single_channel(window); + break; + case DataType::QS8: + case DataType::S8: + fill_constant_value_single_channel(window); + break; + case DataType::U16: + fill_constant_value_single_channel(window); + break; + case DataType::S16: + case DataType::QS16: + fill_constant_value_single_channel(window); + break; + case DataType::U32: + fill_constant_value_single_channel(window); + break; + case DataType::S32: + fill_constant_value_single_channel(window); + break; + case DataType::F32: + static_assert(sizeof(float) == 4, "Float must be 32 bit"); + fill_constant_value_single_channel(window); + break; + default: + ARM_COMPUTE_ERROR("Not handled"); + } + break; + } + case BorderMode::REPLICATE: + { + switch(_tensor->info()->data_type()) + { + case DataType::U8: + fill_replicate_single_channel(window); + break; + case DataType::QS8: + case DataType::S8: + fill_replicate_single_channel(window); + break; + case DataType::U16: + fill_replicate_single_channel(window); + break; + case DataType::S16: + case DataType::QS16: + fill_replicate_single_channel(window); + break; + case DataType::U32: + fill_replicate_single_channel(window); + break; + case DataType::S32: + fill_replicate_single_channel(window); + break; + case DataType::F32: + static_assert(sizeof(float) == 4, "Float must be 32 bit"); + fill_replicate_single_channel(window); + break; + default: + ARM_COMPUTE_ERROR("Not handled"); + } + break; + } + case BorderMode::UNDEFINED: + break; // Nothing to do here + default: + ARM_COMPUTE_ERROR("Unknown border mode"); + } +} + +template +void NEFillBorderKernel::fill_replicate_single_channel(const Window &window) +{ + uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor); + const size_t &width = _tensor->info()->valid_region().shape[0]; + const size_t &height = 
_tensor->info()->valid_region().shape[1]; + + // Left and right border + Window vertical(window); + vertical.set(Window::DimY, Window::Dimension(0, height, 1)); + + Iterator vertical_it(_tensor, vertical); + + execute_window_loop(vertical, [&](const Coordinates & id) + { + const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); + const auto left_val = *reinterpret_cast(vertical_it.ptr()); + const auto right_val = *(reinterpret_cast(vertical_it.ptr()) + width - 1); + + // Fill left and right borders + std::fill_n(row_start - _border_size.left, _border_size.left, left_val); + std::fill_n(row_start + width, _border_size.right, right_val); + }, + vertical_it); + + // Top and bottom border + Iterator plane_it(_tensor, window); + + // Iterate over all XY planes + execute_window_loop(window, [&](const Coordinates & id) + { + const auto first_row = reinterpret_cast(start_valid_region + plane_it.offset()); + + // Top border + for(int i = -_border_size.top; i < 0; ++i) + { + const auto row_start = reinterpret_cast(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]); + + // Copy top rows including left/right borders + std::copy_n(first_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left); + } + + const auto last_row = reinterpret_cast(start_valid_region + plane_it.offset() + (height - 1) * _tensor->info()->strides_in_bytes()[1]); + + // Bottom border + for(unsigned int i = height; i < height + _border_size.bottom; ++i) + { + const auto row_start = reinterpret_cast(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]); + + // Copy bottom rows including left/right borders + std::copy_n(last_row - _border_size.left, _border_size.left + width + _border_size.right, row_start - _border_size.left); + } + }, + plane_it); +} + +template +void NEFillBorderKernel::fill_constant_value_single_channel(const Window &window) +{ + T constant_border_value; + _constant_border_value.get(constant_border_value); + + uint8_t *const start_valid_region = _tensor->ptr_to_element(_tensor->info()->valid_region().anchor); + const size_t &width = _tensor->info()->valid_region().shape[0]; + const size_t &height = _tensor->info()->valid_region().shape[1]; + + // Left and right border + Window vertical(window); + vertical.set(Window::DimY, Window::Dimension(0, height, 1)); + + Iterator vertical_it(_tensor, vertical); + + execute_window_loop(vertical, [&](const Coordinates & id) + { + const auto row_start = reinterpret_cast(start_valid_region + vertical_it.offset()); + + // Fill left and right borders + std::fill_n(row_start - _border_size.left, _border_size.left, constant_border_value); + std::fill_n(row_start + width, _border_size.right, constant_border_value); + }, + vertical_it); + + // Top and bottom border + Iterator plane_it(_tensor, window); + + // Iterate over all XY planes + execute_window_loop(window, [&](const Coordinates & id) + { + // Top border + for(int i = -_border_size.top; i < 0; ++i) + { + const auto row_start = reinterpret_cast(start_valid_region + plane_it.offset() + i * _tensor->info()->strides_in_bytes()[1]); + + // Fill top rows including left/right borders + std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value); + } + + // Bottom border + for(unsigned int i = height; i < height + _border_size.bottom; ++i) + { + const auto row_start = reinterpret_cast(start_valid_region + plane_it.offset() + i * 
_tensor->info()->strides_in_bytes()[1]); + + // Fill bottom rows including left/right borders + std::fill_n(row_start - _border_size.left, _border_size.left + width + _border_size.right, constant_border_value); + } + }, + plane_it); +} diff --git a/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp new file mode 100644 index 0000000000..699a5d9299 --- /dev/null +++ b/src/core/NEON/kernels/NEFillInnerBorderKernel.cpp @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEFillInnerBorderKernel::NEFillInnerBorderKernel() + : _tensor(nullptr), _border_size(0), _constant_border_value(0) +{ +} + +void NEFillInnerBorderKernel::configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16, DataType::S32, DataType::F32); + + _tensor = input; + _border_size = border_size; + _constant_border_value = constant_border_value; + + Window win; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + win.set(Window::DimY, Window::Dimension(0, 1, 1)); + win.use_tensor_dimensions(_tensor->info(), Window::DimZ); + INEKernel::configure(win); +} + +void NEFillInnerBorderKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + // If there is no border: early exit + if(_border_size.empty()) + { + return; + } + + switch(_tensor->info()->data_type()) + { + case DataType::U8: + fill_value_single_channel(window); + break; + case DataType::S16: + fill_value_single_channel(window); + break; + case DataType::S32: + fill_value_single_channel(window); + break; + case DataType::F32: + static_assert(sizeof(float) == 4, "Float must be 32 bit"); + fill_value_single_channel(window); + break; + default: + ARM_COMPUTE_ERROR("Not handled"); + break; + } +} + +template +void 
NEFillInnerBorderKernel::fill_value_single_channel(const Window &window) +{ + const size_t stride = _tensor->info()->strides_in_bytes()[1]; + const size_t width = _tensor->info()->dimension(0); + const size_t height = _tensor->info()->dimension(1); + + T constant_border_value; + _constant_border_value.get(constant_border_value); + + // Left and right border + // All X values are set at once + Window vertical(window); + vertical.set(Window::DimY, Window::Dimension(0, height, 1)); + + Iterator vertical_it(_tensor, vertical); + + execute_window_loop(vertical, [&](const Coordinates & id) + { + std::fill_n(reinterpret_cast(vertical_it.ptr()), _border_size.left, constant_border_value); + std::fill_n(reinterpret_cast(vertical_it.ptr()) + width - _border_size.right, _border_size.right, constant_border_value); + }, + vertical_it); + + // Top and bottom border + // All values are set at once + Iterator horizontal_it(_tensor, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + for(size_t i = 0; i < _border_size.top; ++i) + { + std::fill_n(reinterpret_cast(horizontal_it.ptr() + i * stride), width, constant_border_value); + } + + for(size_t i = 0; i < _border_size.bottom; ++i) + { + std::fill_n(reinterpret_cast(horizontal_it.ptr() + (height - i - 1) * stride), width, constant_border_value); + } + }, + horizontal_it); +} diff --git a/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp new file mode 100644 index 0000000000..3ff8b7b201 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMInterleave4x4Kernel.cpp @@ -0,0 +1,191 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +void gemm_interleave_8bit_elements(const ITensor *input, ITensor *output, const Window &window) +{ + const size_t in_stride = input->info()->strides_in_bytes()[1]; + + // Set window for output tensor + Window win_out(window); + win_out.scale(Window::DimY, 0.25f); + Iterator in(input, window); + + win_out.set_dimension_step(Window::DimX, 32); + Iterator out(output, win_out); + + execute_window_loop(window, [&](const Coordinates &) + { + const uint8x8x4_t data = + { + { + vld1_u8(in.ptr() + 0 * in_stride), + vld1_u8(in.ptr() + 1 * in_stride), + vld1_u8(in.ptr() + 2 * in_stride), + vld1_u8(in.ptr() + 3 * in_stride), + } + }; + vst4_u8(out.ptr(), data); + }, + in, out); +} + +void gemm_interleave_16bit_elements(const ITensor *input, ITensor *output, const Window &window) +{ + const size_t in_stride = input->info()->strides_in_bytes()[1]; + + // Set window for output tensor + Window win_out(window); + win_out.scale(Window::DimY, 0.25f); + Iterator in(input, window); + + win_out.set_dimension_step(Window::DimX, 16); + Iterator out(output, win_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x4x4_t data = + { + { + vld1_u16(reinterpret_cast(in.ptr() + 0 * in_stride)), + vld1_u16(reinterpret_cast(in.ptr() + 1 * in_stride)), + vld1_u16(reinterpret_cast(in.ptr() + 2 * in_stride)), + vld1_u16(reinterpret_cast(in.ptr() + 3 * in_stride)), + } + }; + vst4_u16(reinterpret_cast(out.ptr()), data); + }, + in, out); +} + +void gemm_interleave_32bit_elements(const ITensor *input, ITensor *output, const Window &window) +{ + const size_t in_stride = input->info()->strides_in_bytes()[1]; + + // Set window for output tensor + Window win_out(window); + win_out.scale(Window::DimY, 0.25f); + Iterator in(input, window); + + win_out.set_dimension_step(Window::DimX, 16); + Iterator out(output, win_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint32x4x4_t data = + { + { + vld1q_u32(reinterpret_cast(in.ptr() + 0 * in_stride)), + vld1q_u32(reinterpret_cast(in.ptr() + 1 * in_stride)), + vld1q_u32(reinterpret_cast(in.ptr() + 2 * in_stride)), + vld1q_u32(reinterpret_cast(in.ptr() + 3 * in_stride)) + } + }; + vst4q_u32(reinterpret_cast(out.ptr()), data); + }, + in, out); +} +} // namespace + +NEGEMMInterleave4x4Kernel::NEGEMMInterleave4x4Kernel() + : _func(nullptr) +{ +} + +void NEGEMMInterleave4x4Kernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) != input->info()->dimension(0) * 4); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) != std::ceil(input->info()->dimension(1) / 4.0f)); + + _input = input; + _output = output; + 
+ unsigned int num_elems_processed_per_iteration_x = 4; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + switch(input->info()->element_size()) + { + case 1: + num_elems_processed_per_iteration_x = 8; + _func = &gemm_interleave_8bit_elements; + break; + case 2: + _func = &gemm_interleave_16bit_elements; + break; + case 4: + _func = &gemm_interleave_32bit_elements; + break; + default: + ARM_COMPUTE_ERROR_ON("Element size not supported"); + break; + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x * num_elems_processed_per_iteration_y, 1, 4.0f, 0.25f); + AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + update_window_and_padding(win, output_access, input_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEGEMMInterleave4x4Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + /* + * This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | + * |a30 a31 a32 a33| + * + * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ + (*_func)(_input, _output, window); +} diff --git a/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..3558c686b1 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.cpp @@ -0,0 +1,423 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEGEMMLowpMatrixMultiplyKernel::NEGEMMLowpMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr), _a_offset(0), _b_offset(0), _output_offset(0), _output_mult_int(0), _shift(0) +{ +} + +void NEGEMMLowpMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, + int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + + _input0 = input0; + _input1 = input1; + _output = output; + _a_offset = a_offset; + _b_offset = b_offset; + _output_offset = output_offset; + _output_mult_int = output_mult_int; + _shift = shift; + + constexpr unsigned int num_elems_processed_per_iteration_x = 16; + constexpr unsigned int num_elems_processed_per_iteration_y = 4; + + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowHorizontal in0_access(input0->info(), 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal in1_access(input1->info(), 0, num_elems_processed_per_iteration_x); + + update_window_and_padding(win, in0_access, in1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + INEKernel::configure(win); +} + +void NEGEMMLowpMatrixMultiplyKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const size_t in_b_stride = _input1->info()->strides_in_bytes()[1]; + const size_t out_stride = _output->info()->strides_in_bytes()[1]; + + /* Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix */ + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() >> 2, window.y().end() >> 2, 1)); + + /* Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix */ + Window win_b(window); + win_b.set(Window::DimX, Window::Dimension(window.x().start() >> 4, window.x().end() >> 4, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + /* The step x and step y for the output matrix has been already set using in configure() */ + Iterator ina(_input0, win_a); + Iterator inb(_input1, win_b); + Iterator out(_output, window); + + const int32x4_t voffset_a = vdupq_n_s32(_a_offset); + const int32x4_t voffset_b = vdupq_n_s32(_b_offset); + const int32x4_t vshiftr = vdupq_n_s32(-_shift); + + const int width_b = _input1->info()->dimension(0); + + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW + // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 4x4 block will be read from consecutive memory positions + execute_window_loop(window, [&](const Coordinates &) + { + const uint8_t *mtx_a0 = ina.ptr(); + const uint8_t *mtx_b0 = inb.ptr(); + + // Accumulators for the block 0 + int32x4x4_t c0 = + { + { + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset) + } + }; + + // Accumulators for the block 1 + int32x4x4_t c1 = + { + { + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset) + } + }; + + // Accumulators for the block 2 + int32x4x4_t c2 = + { + { + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset) + } + }; + + // Accumulators for the block 3 + int32x4x4_t c3 = + { + { + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset), + vdupq_n_s32(_output_offset) + } + }; + + int k = 0; + // This for loop performs 4 accumulations per iteration + for(; k <= (width_b - 64); k += 64, mtx_a0 += 16, mtx_b0 += 64) + { + const uint8x8_t p00 = vld1_u8(mtx_a0 + 0); + const uint8x8_t p01 = vld1_u8(mtx_a0 + 8); + const uint8x8_t q00l = vld1_u8(mtx_b0 + 0); + const uint8x8_t q00h = vld1_u8(mtx_b0 + 8); + const uint8x8_t q01l = vld1_u8(mtx_b0 + 16); + const uint8x8_t q01h = vld1_u8(mtx_b0 + 24); + const uint8x8_t q02l = vld1_u8(mtx_b0 + 32); + const uint8x8_t q02h = vld1_u8(mtx_b0 + 40); + const uint8x8_t q03l = vld1_u8(mtx_b0 + 48); + const uint8x8_t q03h = vld1_u8(mtx_b0 + 56); + + const int32x4_t ia0l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00)))); + const int32x4_t ia0h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p00)))); + const int32x4_t ia1l = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p01)))); + const int32x4_t ia1h = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(p01)))); + + const int32x2x4_t ia0 = + { + { + vget_low_s32(ia0l), + vget_high_s32(ia0l), + vget_low_s32(ia0h), + vget_high_s32(ia0h) + } + }; + + const int32x2x4_t ia1 = + { + { + vget_low_s32(ia1l), + vget_high_s32(ia1l), + vget_low_s32(ia1h), + vget_high_s32(ia1h) + } + }; + + const int32x4x4_t ib0 = + { + { + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))), + vaddw_s16(voffset_b, 
vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h)))) + } + }; + + const int32x4x4_t ib1 = + { + { + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q01h)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q01h)))) + } + }; + + const int32x4x4_t ib2 = + { + { + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q02h)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q02h)))) + } + }; + + const int32x4x4_t ib3 = + { + { + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q03h)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q03h)))) + } + }; + + // 4x4 block 0 - Accumulation 0 + c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia0.val[0], 0); + c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia0.val[0], 1); + c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia0.val[1], 0); + c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia0.val[1], 1); + // 4x4 block 0 - Accumulation 1 + c0.val[0] = vmlaq_lane_s32(c0.val[0], ib1.val[0], ia0.val[2], 0); + c0.val[1] = vmlaq_lane_s32(c0.val[1], ib1.val[0], ia0.val[2], 1); + c0.val[2] = vmlaq_lane_s32(c0.val[2], ib1.val[0], ia0.val[3], 0); + c0.val[3] = vmlaq_lane_s32(c0.val[3], ib1.val[0], ia0.val[3], 1); + // 4x4 block 0 - Accumulation 2 + c0.val[0] = vmlaq_lane_s32(c0.val[0], ib2.val[0], ia1.val[0], 0); + c0.val[1] = vmlaq_lane_s32(c0.val[1], ib2.val[0], ia1.val[0], 1); + c0.val[2] = vmlaq_lane_s32(c0.val[2], ib2.val[0], ia1.val[1], 0); + c0.val[3] = vmlaq_lane_s32(c0.val[3], ib2.val[0], ia1.val[1], 1); + // 4x4 block 0 - Accumulation 3 + c0.val[0] = vmlaq_lane_s32(c0.val[0], ib3.val[0], ia1.val[2], 0); + c0.val[1] = vmlaq_lane_s32(c0.val[1], ib3.val[0], ia1.val[2], 1); + c0.val[2] = vmlaq_lane_s32(c0.val[2], ib3.val[0], ia1.val[3], 0); + c0.val[3] = vmlaq_lane_s32(c0.val[3], ib3.val[0], ia1.val[3], 1); + + // 4x4 block 1 - Accumulation 0 + c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia0.val[0], 0); + c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia0.val[0], 1); + c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia0.val[1], 0); + c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia0.val[1], 1); + // 4x4 block 1 - Accumulation 1 + c1.val[0] = vmlaq_lane_s32(c1.val[0], ib1.val[1], ia0.val[2], 0); + c1.val[1] = vmlaq_lane_s32(c1.val[1], ib1.val[1], ia0.val[2], 1); + c1.val[2] = vmlaq_lane_s32(c1.val[2], ib1.val[1], ia0.val[3], 0); + c1.val[3] = vmlaq_lane_s32(c1.val[3], ib1.val[1], ia0.val[3], 1); + // 4x4 block 1 - Accumulation 2 + c1.val[0] = vmlaq_lane_s32(c1.val[0], ib2.val[1], ia1.val[0], 0); + c1.val[1] = vmlaq_lane_s32(c1.val[1], ib2.val[1], ia1.val[0], 1); + c1.val[2] = vmlaq_lane_s32(c1.val[2], ib2.val[1], ia1.val[1], 0); + c1.val[3] = vmlaq_lane_s32(c1.val[3], ib2.val[1], ia1.val[1], 1); + // 4x4 block 1 - Accumulation 3 + c1.val[0] = vmlaq_lane_s32(c1.val[0], ib3.val[1], ia1.val[2], 0); + c1.val[1] = vmlaq_lane_s32(c1.val[1], ib3.val[1], ia1.val[2], 1); + c1.val[2] = vmlaq_lane_s32(c1.val[2], 
ib3.val[1], ia1.val[3], 0); + c1.val[3] = vmlaq_lane_s32(c1.val[3], ib3.val[1], ia1.val[3], 1); + + // 4x4 block 2 - Accumulation 0 + c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia0.val[0], 0); + c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia0.val[0], 1); + c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia0.val[1], 0); + c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia0.val[1], 1); + // 4x4 block 2 - Accumulation 1 + c2.val[0] = vmlaq_lane_s32(c2.val[0], ib1.val[2], ia0.val[2], 0); + c2.val[1] = vmlaq_lane_s32(c2.val[1], ib1.val[2], ia0.val[2], 1); + c2.val[2] = vmlaq_lane_s32(c2.val[2], ib1.val[2], ia0.val[3], 0); + c2.val[3] = vmlaq_lane_s32(c2.val[3], ib1.val[2], ia0.val[3], 1); + // 4x4 block 2 - Accumulation 2 + c2.val[0] = vmlaq_lane_s32(c2.val[0], ib2.val[2], ia1.val[0], 0); + c2.val[1] = vmlaq_lane_s32(c2.val[1], ib2.val[2], ia1.val[0], 1); + c2.val[2] = vmlaq_lane_s32(c2.val[2], ib2.val[2], ia1.val[1], 0); + c2.val[3] = vmlaq_lane_s32(c2.val[3], ib2.val[2], ia1.val[1], 1); + // 4x4 block 2 - Accumulation 3 + c2.val[0] = vmlaq_lane_s32(c2.val[0], ib3.val[2], ia1.val[2], 0); + c2.val[1] = vmlaq_lane_s32(c2.val[1], ib3.val[2], ia1.val[2], 1); + c2.val[2] = vmlaq_lane_s32(c2.val[2], ib3.val[2], ia1.val[3], 0); + c2.val[3] = vmlaq_lane_s32(c2.val[3], ib3.val[2], ia1.val[3], 1); + + // 4x4 block 3 - Accumulation 0 + c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia0.val[0], 0); + c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia0.val[0], 1); + c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia0.val[1], 0); + c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia0.val[1], 1); + // 4x4 block 3 - Accumulation 1 + c3.val[0] = vmlaq_lane_s32(c3.val[0], ib1.val[3], ia0.val[2], 0); + c3.val[1] = vmlaq_lane_s32(c3.val[1], ib1.val[3], ia0.val[2], 1); + c3.val[2] = vmlaq_lane_s32(c3.val[2], ib1.val[3], ia0.val[3], 0); + c3.val[3] = vmlaq_lane_s32(c3.val[3], ib1.val[3], ia0.val[3], 1); + // 4x4 block 3 - Accumulation 2 + c3.val[0] = vmlaq_lane_s32(c3.val[0], ib2.val[3], ia1.val[0], 0); + c3.val[1] = vmlaq_lane_s32(c3.val[1], ib2.val[3], ia1.val[0], 1); + c3.val[2] = vmlaq_lane_s32(c3.val[2], ib2.val[3], ia1.val[1], 0); + c3.val[3] = vmlaq_lane_s32(c3.val[3], ib2.val[3], ia1.val[1], 1); + // 4x4 block 3 - Accumulation 3 + c3.val[0] = vmlaq_lane_s32(c3.val[0], ib3.val[3], ia1.val[2], 0); + c3.val[1] = vmlaq_lane_s32(c3.val[1], ib3.val[3], ia1.val[2], 1); + c3.val[2] = vmlaq_lane_s32(c3.val[2], ib3.val[3], ia1.val[3], 0); + c3.val[3] = vmlaq_lane_s32(c3.val[3], ib3.val[3], ia1.val[3], 1); + } + + // This for loop handles the left-over accumulations + for(; k < width_b; k += 16, mtx_a0 += 4, mtx_b0 += 16) + { + const uint8x8_t p00 = vld1_u8(mtx_a0); + const uint8x8_t q00l = vld1_u8(mtx_b0); + const uint8x8_t q00h = vld1_u8(mtx_b0 + 8); + + const int32x4_t ia0 = vaddw_s16(voffset_a, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(p00)))); + + const int32x2x2_t ia = + { + { + vget_low_s32(ia0), + vget_high_s32(ia0) + } + }; + + const int32x4x4_t ib0 = + { + { + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00l)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_low_u16(vmovl_u8(q00h)))), + vaddw_s16(voffset_b, vreinterpret_s16_u16(vget_high_u16(vmovl_u8(q00h)))) + } + }; + + // 4x4 block 0 + c0.val[0] = vmlaq_lane_s32(c0.val[0], ib0.val[0], ia.val[0], 0); + c0.val[1] = vmlaq_lane_s32(c0.val[1], ib0.val[0], ia.val[0], 1); + c0.val[2] = vmlaq_lane_s32(c0.val[2], ib0.val[0], ia.val[1], 0); 
+ c0.val[3] = vmlaq_lane_s32(c0.val[3], ib0.val[0], ia.val[1], 1); + + // 4x4 block 1 + c1.val[0] = vmlaq_lane_s32(c1.val[0], ib0.val[1], ia.val[0], 0); + c1.val[1] = vmlaq_lane_s32(c1.val[1], ib0.val[1], ia.val[0], 1); + c1.val[2] = vmlaq_lane_s32(c1.val[2], ib0.val[1], ia.val[1], 0); + c1.val[3] = vmlaq_lane_s32(c1.val[3], ib0.val[1], ia.val[1], 1); + + // 4x4 block 2 + c2.val[0] = vmlaq_lane_s32(c2.val[0], ib0.val[2], ia.val[0], 0); + c2.val[1] = vmlaq_lane_s32(c2.val[1], ib0.val[2], ia.val[0], 1); + c2.val[2] = vmlaq_lane_s32(c2.val[2], ib0.val[2], ia.val[1], 0); + c2.val[3] = vmlaq_lane_s32(c2.val[3], ib0.val[2], ia.val[1], 1); + + // 4x4 block 3 + c3.val[0] = vmlaq_lane_s32(c3.val[0], ib0.val[3], ia.val[0], 0); + c3.val[1] = vmlaq_lane_s32(c3.val[1], ib0.val[3], ia.val[0], 1); + c3.val[2] = vmlaq_lane_s32(c3.val[2], ib0.val[3], ia.val[1], 0); + c3.val[3] = vmlaq_lane_s32(c3.val[3], ib0.val[3], ia.val[1], 1); + } + + c0.val[0] = vshlq_s32(vmulq_n_s32(c0.val[0], _output_mult_int), vshiftr); + c0.val[1] = vshlq_s32(vmulq_n_s32(c0.val[1], _output_mult_int), vshiftr); + c0.val[2] = vshlq_s32(vmulq_n_s32(c0.val[2], _output_mult_int), vshiftr); + c0.val[3] = vshlq_s32(vmulq_n_s32(c0.val[3], _output_mult_int), vshiftr); + + c1.val[0] = vshlq_s32(vmulq_n_s32(c1.val[0], _output_mult_int), vshiftr); + c1.val[1] = vshlq_s32(vmulq_n_s32(c1.val[1], _output_mult_int), vshiftr); + c1.val[2] = vshlq_s32(vmulq_n_s32(c1.val[2], _output_mult_int), vshiftr); + c1.val[3] = vshlq_s32(vmulq_n_s32(c1.val[3], _output_mult_int), vshiftr); + + c2.val[0] = vshlq_s32(vmulq_n_s32(c2.val[0], _output_mult_int), vshiftr); + c2.val[1] = vshlq_s32(vmulq_n_s32(c2.val[1], _output_mult_int), vshiftr); + c2.val[2] = vshlq_s32(vmulq_n_s32(c2.val[2], _output_mult_int), vshiftr); + c2.val[3] = vshlq_s32(vmulq_n_s32(c2.val[3], _output_mult_int), vshiftr); + + c3.val[0] = vshlq_s32(vmulq_n_s32(c3.val[0], _output_mult_int), vshiftr); + c3.val[1] = vshlq_s32(vmulq_n_s32(c3.val[1], _output_mult_int), vshiftr); + c3.val[2] = vshlq_s32(vmulq_n_s32(c3.val[2], _output_mult_int), vshiftr); + c3.val[3] = vshlq_s32(vmulq_n_s32(c3.val[3], _output_mult_int), vshiftr); + + const uint8x16x4_t r = + { + { + vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[0]), vqmovn_s32(c1.val[0]))), + vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[0]), vqmovn_s32(c3.val[0])))), + vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[1]), vqmovn_s32(c1.val[1]))), + vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[1]), vqmovn_s32(c3.val[1])))), + vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[2]), vqmovn_s32(c1.val[2]))), + vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[2]), vqmovn_s32(c3.val[2])))), + vcombine_u8(vqmovun_s16(vcombine_s16(vqmovn_s32(c0.val[3]), vqmovn_s32(c1.val[3]))), + vqmovun_s16(vcombine_s16(vqmovn_s32(c2.val[3]), vqmovn_s32(c3.val[3])))) + } + }; + + uint8_t *const mtx_out = out.ptr(); + vst1q_u8(mtx_out + 0 * out_stride, r.val[0]); + vst1q_u8(mtx_out + 1 * out_stride, r.val[1]); + vst1q_u8(mtx_out + 2 * out_stride, r.val[2]); + vst1q_u8(mtx_out + 3 * out_stride, r.val[3]); + }, + ina, inb, out); +} diff --git a/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp new file mode 100644 index 0000000000..7a3bae50c0 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.cpp @@ -0,0 +1,128 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +NEGEMMMatrixAccumulateBiasesKernel::NEGEMMMatrixAccumulateBiasesKernel() + : _accum(nullptr), _biases(nullptr) +{ +} + +void NEGEMMMatrixAccumulateBiasesKernel::configure(ITensor *accum, const ITensor *biases) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(accum, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(biases, accum); + ARM_COMPUTE_ERROR_ON(biases->info()->num_dimensions() != 1); + + _biases = biases; + _accum = accum; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*accum->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic biases_access(biases->info(), 0, 0, biases->info()->dimension(0), biases->info()->dimension(1)); + + update_window_and_padding(win, + AccessWindowHorizontal(accum->info(), 0, num_elems_processed_per_iteration), + biases_access); + + AccessWindowHorizontal output_access(accum->info(), 0, num_elems_processed_per_iteration); + + // Set the valid region for the accum tensor + Coordinates coord; + coord.set_num_dimensions(accum->info()->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, accum->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEGEMMMatrixAccumulateBiasesKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_biases; + win_biases.set(Window::DimX, Window::Dimension(window.x().start(), window.x().end(), window.x().step())); + win_biases.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator in0_out(_accum, window); + Iterator in1(_biases, win_biases); + + switch(_accum->info()->data_type()) + { + case DataType::F32: + { + execute_window_loop(window, [&](const Coordinates & id) + 
{ + const float32x4x4_t accum = vld4q_f32(reinterpret_cast(in0_out.ptr())); + const float32x4x4_t biases = vld4q_f32(reinterpret_cast(in1.ptr())); + const float32x4x4_t res = + { + { + vaddq_f32(accum.val[0], biases.val[0]), + vaddq_f32(accum.val[1], biases.val[1]), + vaddq_f32(accum.val[2], biases.val[2]), + vaddq_f32(accum.val[3], biases.val[3]) + } + }; + + vst4q_f32(reinterpret_cast(in0_out.ptr()), res); + }, + in0_out, in1); + break; + } + case DataType::QS8: + { + execute_window_loop(window, [&](const Coordinates & id) + { + const qint8x16_t accum = vld1q_qs8(reinterpret_cast(in0_out.ptr())); + const qint8x16_t biases = vld1q_qs8(reinterpret_cast(in1.ptr())); + + vst1q_qs8(reinterpret_cast(in0_out.ptr()), vqaddq_qs8(accum, biases)); + }, + in0_out, in1); + break; + } + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } +} diff --git a/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp new file mode 100644 index 0000000000..71dd4c7aa1 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMMatrixAdditionKernel.cpp @@ -0,0 +1,202 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +void matrix_addition_f32(const ITensor *input, ITensor *output, const Window &window, float beta) +{ + const float32x4_t beta_f32 = vdupq_n_f32(beta); + + Iterator in(input, window); + Iterator out(output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + float32x4x4_t alpha_ab = + { + { + vld1q_f32(out_ptr + 0), + vld1q_f32(out_ptr + 4), + vld1q_f32(out_ptr + 8), + vld1q_f32(out_ptr + 12) + } + }; + + const float32x4x4_t c = + { + { + vld1q_f32(in_ptr + 0), + vld1q_f32(in_ptr + 4), + vld1q_f32(in_ptr + 8), + vld1q_f32(in_ptr + 12) + } + }; + + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vmlaq_f32(alpha_ab.val[0], c.val[0], beta_f32); + alpha_ab.val[1] = vmlaq_f32(alpha_ab.val[1], c.val[1], beta_f32); + alpha_ab.val[2] = vmlaq_f32(alpha_ab.val[2], c.val[2], beta_f32); + alpha_ab.val[3] = vmlaq_f32(alpha_ab.val[3], c.val[3], beta_f32); + + vst1q_f32(out_ptr + 0, alpha_ab.val[0]); + vst1q_f32(out_ptr + 4, alpha_ab.val[1]); + vst1q_f32(out_ptr + 8, alpha_ab.val[2]); + vst1q_f32(out_ptr + 12, alpha_ab.val[3]); + }, + in, out); +} + +#ifdef ARM_COMPUTE_ENABLE_FP16 +void matrix_addition_f16(const ITensor *input, ITensor *output, const Window &window, float beta) +{ + const float16x8_t beta_f16 = vdupq_n_f16(beta); + + Iterator in(input, window); + Iterator out(output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + float16x8x2_t alpha_ab = + { + { + vld1q_f16(out_ptr + 0), + vld1q_f16(out_ptr + 8) + } + }; + + float16x8x2_t c = + { + { + vld1q_f16(in_ptr + 0), + vld1q_f16(in_ptr + 8) + } + }; + + // Multiply matrix C by its weight and accumulate + alpha_ab.val[0] = vaddq_f16(alpha_ab.val[0], vmulq_f16(c.val[0], beta_f16)); + alpha_ab.val[1] = vaddq_f16(alpha_ab.val[1], vmulq_f16(c.val[1], beta_f16)); + + vst1q_f16(out_ptr + 0, alpha_ab.val[0]); + vst1q_f16(out_ptr + 8, alpha_ab.val[1]); + }, + in, out); +} +#endif + +void matrix_addition_qs8(const ITensor *input, ITensor *output, const Window &window, float beta) +{ + const int fixed_point_position = input->info()->fixed_point_position(); + const qint8x16_t beta_qs8 = vdupq_n_qs8(scvt_qs8_f32(beta, fixed_point_position)); + + Iterator in(input, window); + Iterator out(output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + qint8x16_t alpha_ab = vld1q_qs8(out_ptr); + const qint8x16_t c = vld1q_qs8(in_ptr); + + // Multiply matrix C by its weight and accumulate + alpha_ab = vqmlaq_qs8(alpha_ab, c, beta_qs8, fixed_point_position); + + vst1q_qs8(out_ptr, alpha_ab); + }, + in, out); +} +} // namespace + +NEGEMMMatrixAdditionKernel::NEGEMMMatrixAdditionKernel() + : INESimpleKernel(), _func(nullptr), _beta(0.0f) +{ +} + +void NEGEMMMatrixAdditionKernel::configure(const ITensor *input, ITensor *output, float beta) +{ + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + switch(input->info()->data_type()) + { + case DataType::F32: + _func = &matrix_addition_f32; + break; + case DataType::QS8: + _func = &matrix_addition_qs8; + break; + case DataType::F16: +#ifdef ARM_COMPUTE_ENABLE_FP16 + _func = &matrix_addition_f16; + break; +#endif + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + INESimpleKernel::configure(input, output, num_elems_processed_per_iteration); + + _beta = beta; +} + +void NEGEMMMatrixAdditionKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + if(_beta != 0.0f) + { + (*_func)(_input, _output, window, _beta); + } +} diff --git a/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..dcfbb13081 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.cpp @@ -0,0 +1,1168 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +template +void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) +{ + const auto width_matrix_b = static_cast(output->info()->dimension(0)); + const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type())); + const auto num_elems_vec_a = static_cast(input0->info()->dimension(0)); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * window.thread_id(); + const int window_step_x = 16 * window.num_threads(); + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(input1->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(input0, win_a); + Iterator inb(input1, win_b); + Iterator out(output, win_out); + + execute_window_loop(win_out, [&](const Coordinates & id) + { + if(id.x() > width_matrix_b) + { + return; + } + + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); +#endif + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); + + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = 
vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); +#endif + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + + a0l = vld1_f32(vec_a); + + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + } + + for(; vec_a < vec_a_end_addr;) + { + const float a0 = *vec_a; + + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); + + vec_a += 1; + matrix_b += in_b_stride; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + acc0 = vmulq_f32(acc0, alpha_f32); + acc1 = vmulq_f32(acc1, alpha_f32); + acc2 = vmulq_f32(acc2, alpha_f32); + acc3 = vmulq_f32(acc3, alpha_f32); + } + + const auto vec_out = reinterpret_cast(out.ptr()); + + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + }, + ina, inb, out); +} + +template +void vector_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) +{ + const auto width_matrix_b = static_cast(output->info()->dimension(0)); + const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type())); + const auto num_elems_vec_a = static_cast(input0->info()->dimension(0)); + const int fixed_point_position = input0->info()->fixed_point_position(); + + // The implementation computes 32 elements per iteration + const int window_start_x = 32 * window.thread_id(); + const int window_step_x = 32 * window.num_threads(); + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = 
ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_out.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(input1->info()->num_dimensions() >= 3) + { + win_b = window; + } + win_b.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + Iterator ina(input0, win_a); + Iterator inb(input1, win_b); + Iterator out(output, win_out); + + execute_window_loop(win_out, [&](const Coordinates & id) + { + if(id.x() > width_matrix_b) + { + return; + } + + // Reset accumulators + qint16x8_t acc00_qs16 = vdupq_n_qs16(0); + qint16x8_t acc01_qs16 = vdupq_n_qs16(0); + qint16x8_t acc02_qs16 = vdupq_n_qs16(0); + qint16x8_t acc03_qs16 = vdupq_n_qs16(0); + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(inb.ptr()); + + auto vec_a_end_addr = vec_a + num_elems_vec_a; + for(; vec_a <= (vec_a_end_addr - 2);) + { + const qint8x8_t a0 = vld1_dup_qs8(vec_a + 0); + const qint8x8_t a1 = vld1_dup_qs8(vec_a + 1); + + const qint8x8_t b00 = vld1_qs8(matrix_b + 0 + 0 * in_b_stride); + const qint8x8_t b01 = vld1_qs8(matrix_b + 8 + 0 * in_b_stride); + const qint8x8_t b02 = vld1_qs8(matrix_b + 16 + 0 * in_b_stride); + const qint8x8_t b03 = vld1_qs8(matrix_b + 24 + 0 * in_b_stride); + const qint8x8_t b10 = vld1_qs8(matrix_b + 0 + 1 * in_b_stride); + const qint8x8_t b11 = vld1_qs8(matrix_b + 8 + 1 * in_b_stride); + const qint8x8_t b12 = vld1_qs8(matrix_b + 16 + 1 * in_b_stride); + const qint8x8_t b13 = vld1_qs8(matrix_b + 24 + 1 * in_b_stride); + + // First accumulation + acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); + acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); + acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position); + acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position); + + // Second accumulation + acc00_qs16 = vqmlal_qs8(acc00_qs16, b10, a1, fixed_point_position); + acc01_qs16 = vqmlal_qs8(acc01_qs16, b11, a1, fixed_point_position); + acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a1, fixed_point_position); + acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a1, fixed_point_position); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + } + + for(; vec_a < vec_a_end_addr;) + { + const qint8x8_t a0 = vld1_dup_qs8(vec_a); + + const qint8x8_t b00 = vld1_qs8(matrix_b + 0); + const qint8x8_t b01 = vld1_qs8(matrix_b + 8); + const qint8x8_t b02 = vld1_qs8(matrix_b + 16); + const qint8x8_t b03 = vld1_qs8(matrix_b + 24); + + acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); + acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); + acc02_qs16 = vqmlal_qs8(acc02_qs16, b02, a0, fixed_point_position); + acc03_qs16 = vqmlal_qs8(acc03_qs16, b03, a0, fixed_point_position); + + vec_a += 1; + matrix_b += in_b_stride; + } + + // Convert back to qint8x8_t and saturate + qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16); + qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16); + qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16); + qint8x8_t 
acc03_qs8 = vqmovn_qs16(acc03_qs16); + + // Multiply by the weight of the matrix product (alpha) + if(multiply_alpha) + { + const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position)); + acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position); + acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position); + acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position); + acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position); + } + + const auto mtx_out0 = reinterpret_cast(out.ptr()); + + // Store 8x4 output elements + vst1_qs8(mtx_out0 + 0, acc00_qs8); + vst1_qs8(mtx_out0 + 8, acc01_qs8); + vst1_qs8(mtx_out0 + 16, acc02_qs8); + vst1_qs8(mtx_out0 + 24, acc03_qs8); + }, + ina, inb, out); +} + +template +void matrix_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) +{ + const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); + const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; + const int num_elems_matrix_b_x = input1->info()->dimension(0); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(input1->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. 
Scale by a factor of 4 the X range as the input transposed matrix A has 4 times less the cols of the output matrix + // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 4x4 + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 4, window.x().end() / 4, 2 * in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator ina(input0, win_a); + Iterator inb(input1, win_b); + Iterator out(output, window); + + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW + // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 4x4 block will be read from consecutive memory positions + execute_window_loop(window, [&](const Coordinates & id) + { + auto mtx_a0 = reinterpret_cast(ina.ptr()); + auto mtx_b0 = reinterpret_cast(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; + + float32x4_t acc00 = vdupq_n_f32(0.f); + float32x4_t acc10 = vdupq_n_f32(0.f); + float32x4_t acc20 = vdupq_n_f32(0.f); + float32x4_t acc30 = vdupq_n_f32(0.f); + + float32x4_t acc01 = vdupq_n_f32(0.f); + float32x4_t acc11 = vdupq_n_f32(0.f); + float32x4_t acc21 = vdupq_n_f32(0.f); + float32x4_t acc31 = vdupq_n_f32(0.f); + +#if __arm__ + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(mtx_b1))); +#endif + + auto mtx_b0_end_addr = mtx_b0 + num_elems_matrix_b_x; + for(; mtx_b0 <= (mtx_b0_end_addr - 32);) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + float32x4_t b01 = vld1q_f32(mtx_b0 + 4); + float32x4_t b11 = vld1q_f32(mtx_b1 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); +#endif + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + float32x4_t a4 = vld1q_dup_f32(mtx_a0 + 4); + float32x4_t a5 = vld1q_dup_f32(mtx_a0 + 5); + float32x4_t a6 = vld1q_dup_f32(mtx_a0 + 6); + float32x4_t a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, 
a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(mtx_b1))); +#endif + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + + a0 = vld1q_dup_f32(mtx_a0 + 0); + a1 = vld1q_dup_f32(mtx_a0 + 1); + a2 = vld1q_dup_f32(mtx_a0 + 2); + a3 = vld1q_dup_f32(mtx_a0 + 3); + b00 = vld1q_f32(mtx_b0); + b10 = vld1q_f32(mtx_b1); + b01 = vld1q_f32(mtx_b0 + 4); + b11 = vld1q_f32(mtx_b1 + 4); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + a4 = vld1q_dup_f32(mtx_a0 + 4); + a5 = vld1q_dup_f32(mtx_a0 + 5); + a6 = vld1q_dup_f32(mtx_a0 + 6); + a7 = vld1q_dup_f32(mtx_a0 + 7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b01, a4); + acc10 = vmlaq_f32(acc10, b01, a5); + acc20 = vmlaq_f32(acc20, b01, a6); + acc30 = vmlaq_f32(acc30, b01, a7); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b11, a4); + acc11 = vmlaq_f32(acc11, b11, a5); + acc21 = vmlaq_f32(acc21, b11, a6); + acc31 = vmlaq_f32(acc31, b11, a7); + + mtx_a0 += 8; + mtx_b0 += 8; + mtx_b1 += 8; + } + + for(; mtx_b0 < mtx_b0_end_addr;) + { + float32x4_t a0 = vld1q_dup_f32(mtx_a0 + 0); + float32x4_t a1 = vld1q_dup_f32(mtx_a0 + 1); + float32x4_t a2 = vld1q_dup_f32(mtx_a0 + 2); + float32x4_t a3 = vld1q_dup_f32(mtx_a0 + 3); + float32x4_t b00 = vld1q_f32(mtx_b0); + float32x4_t b10 = vld1q_f32(mtx_b1); + 
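+            // Leftover pass: each iteration consumes 4 interleaved values of matrix A plus one 4-float block from mtx_b0 and one from mtx_b1, adding a single contribution to both 4x4 accumulator blocks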
+#if __arm__ + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); +#endif + // 4x4 block 0 + acc00 = vmlaq_f32(acc00, b00, a0); + acc10 = vmlaq_f32(acc10, b00, a1); + acc20 = vmlaq_f32(acc20, b00, a2); + acc30 = vmlaq_f32(acc30, b00, a3); + + // 4x4 block 1 + acc01 = vmlaq_f32(acc01, b10, a0); + acc11 = vmlaq_f32(acc11, b10, a1); + acc21 = vmlaq_f32(acc21, b10, a2); + acc31 = vmlaq_f32(acc31, b10, a3); + + mtx_a0 += 4; + mtx_b0 += 4; + mtx_b1 += 4; + } + + // Multiply by the weight of matrix product (alpha) + if(multiply_alpha) + { + const float32x4_t alpha_f32 = vdupq_n_f32(alpha); + acc00 = vmulq_f32(acc00, alpha_f32); + acc10 = vmulq_f32(acc10, alpha_f32); + acc20 = vmulq_f32(acc20, alpha_f32); + acc30 = vmulq_f32(acc30, alpha_f32); + acc01 = vmulq_f32(acc01, alpha_f32); + acc11 = vmulq_f32(acc11, alpha_f32); + acc21 = vmulq_f32(acc21, alpha_f32); + acc31 = vmulq_f32(acc31, alpha_f32); + } + + const auto mtx_out0 = reinterpret_cast(out.ptr()); + const auto mtx_out1 = mtx_out0 + 4; + + // Store the 4 blocks + vst1q_f32(mtx_out0, acc00); + vst1q_f32(mtx_out1, acc01); + vst1q_f32(mtx_out0 + out_stride1, acc10); + vst1q_f32(mtx_out1 + out_stride1, acc11); + vst1q_f32(mtx_out0 + out_stride2, acc20); + vst1q_f32(mtx_out1 + out_stride2, acc21); + vst1q_f32(mtx_out0 + out_stride3, acc30); + vst1q_f32(mtx_out1 + out_stride3, acc31); + }, + ina, inb, out); +} + +template +void matrix_matrix_multiply_f16(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) +{ +#ifdef ARM_COMPUTE_ENABLE_FP16 + const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); + const size_t out_stride = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(input1->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. Scale by a factor of 8 the X range as the input transposed matrix A has 8 times less the cols of the output matrix + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 8, window.x().end() / 8, in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 1, 0)); + + Iterator ina(input0, win_a); + Iterator inb(input1, win_b); + Iterator out(output, window); + + // Number of iterations of inner loop. 
Since 8 is the number of accumulations per loop, num_it = (width_mtx_b / 4) / 8 + const size_t num_it = ((input1->info()->dimension(0)) >> 2) >> 3; + + const float16x8_t alpha_f16 = vdupq_n_f16(alpha); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto *mtx_a0 = reinterpret_cast(ina.ptr()); + const auto *mtx_b0 = reinterpret_cast(inb.ptr()); + auto *mtx_out = reinterpret_cast(out.ptr()); + float16x8x4_t c = + { + { + vdupq_n_f16(0.f), + vdupq_n_f16(0.f), + vdupq_n_f16(0.f), + vdupq_n_f16(0.f) + } + }; + + /* + This kernel puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + |a00 a01 a02 a03 | a04 a05 a06 a07| + |a10 a11 a12 a13 | a14 a15 a16 a17| + |a20 a21 a22 a23 | a24 a25 a26 a27| = | a00 a10 a20 a30 || a01 a11 a21 a31 || a02 a12 a22 a32 || a03 a13 a23 a33 | a40 a50 a60 a70 | ... + |a30 a31 a32 a33 | a34 a35 a36 a37| | a04 a14 a24 a34 || a05 a15 a25 a35 || a06 a15 a26 a36 || a07 a17 a27 a37 | a44 a54 a64 a74 | ... + |a40 a41 a42 a43 | a44 a45 a46 a47| + |a50 a51 a52 a53 | a54 a55 a56 a57| + |a60 a61 a62 a63 | a64 a65 a66 a67| + |a70 a71 a72 a73 | a74 a75 a76 a77| + + After this operation, the output matrix will have the following shape: [ height * 4, width / 4 ] + + B Matrix has been transposed as shown below + + |b00 b01 b02 b03 b04 b05 b06 b07| + |b10 b11 b12 b13 b14 b15 b16 b17| + |b20 b21 b22 b23 b24 b25 b26 b27| + |b30 b31 b32 b33 b34 b35 b36 b37| + -------------------> + + |b00 b01 b02 b03 b04 b05 b06 b07||b10 b11 b12 b13 b14 b15 b16 b17||b20 b21 b22 b23 b24 b25 b26 b27||b30 b31 b32 b33 b34 b35 b36 b37| + + c.val[0][0] = a00*b00 + a01*b10 + a02*b20 + a03*b30 + c.val[0][1] = a00*b01 + a01*b11 + a02*b21 + a03*b31 + + The size of the output tensor's XY-plane must be the following shape [ width * 8, height / 8 ]. All other dimensions must have the same size. 
+ */ + for(size_t k = num_it; k > 0; mtx_a0 += 16, mtx_b0 += 32, --k) + { + const float16x8_t p00 = vld1q_f16(mtx_a0); + const float16x8_t p02 = vld1q_f16(mtx_a0 + 8); + const float16x8_t q00 = vld1q_f16(mtx_b0); + const float16x8_t q02 = vld1q_f16(mtx_b0 + 8); + const float16x8_t q04 = vld1q_f16(mtx_b0 + 16); + const float16x8_t q06 = vld1q_f16(mtx_b0 + 24); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q00, vgetq_lane_f16(p00, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q00, vgetq_lane_f16(p00, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q00, vgetq_lane_f16(p00, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q00, vgetq_lane_f16(p00, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q02, vgetq_lane_f16(p00, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q02, vgetq_lane_f16(p00, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q02, vgetq_lane_f16(p00, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q02, vgetq_lane_f16(p00, 7))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q04, vgetq_lane_f16(p02, 0))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q04, vgetq_lane_f16(p02, 1))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q04, vgetq_lane_f16(p02, 2))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q04, vgetq_lane_f16(p02, 3))); + + c.val[0] = vaddq_f16(c.val[0], vmulq_n_f16(q06, vgetq_lane_f16(p02, 4))); + c.val[1] = vaddq_f16(c.val[1], vmulq_n_f16(q06, vgetq_lane_f16(p02, 5))); + c.val[2] = vaddq_f16(c.val[2], vmulq_n_f16(q06, vgetq_lane_f16(p02, 6))); + c.val[3] = vaddq_f16(c.val[3], vmulq_n_f16(q06, vgetq_lane_f16(p02, 7))); + } + + if(multiply_alpha) + { + c.val[0] = vmulq_f16(c.val[0], alpha_f16); + c.val[1] = vmulq_f16(c.val[1], alpha_f16); + c.val[2] = vmulq_f16(c.val[2], alpha_f16); + c.val[3] = vmulq_f16(c.val[3], alpha_f16); + } + + vst1q_f16(mtx_out + 0 * out_stride, c.val[0]); + vst1q_f16(mtx_out + 1 * out_stride, c.val[1]); + vst1q_f16(mtx_out + 2 * out_stride, c.val[2]); + vst1q_f16(mtx_out + 3 * out_stride, c.val[3]); + }, + ina, inb, out); +#else + ARM_COMPUTE_ERROR("Not implemented"); +#endif +} + +template +void matrix_matrix_multiply_qs8(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window, float alpha) +{ + const size_t in_b_stride = input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type()); + const size_t out_stride1 = output->info()->strides_in_bytes()[1] / data_size_from_type(output->info()->data_type()); + const size_t out_stride2 = out_stride1 * 2; + const size_t out_stride3 = out_stride1 * 3; + const int num_elems_matrix_b_x = input1->info()->dimension(0); + const int fixed_point_position = input0->info()->fixed_point_position(); + const qint8x8_t alpha_qs8 = vdup_n_qs8(scvt_qs8_f32(alpha, fixed_point_position)); + ARM_COMPUTE_UNUSED(alpha_qs8); + + // Set step_x and step_y for matrix A. Scale by a factor of 4 the Y range as the input interleaved matrix A has 4 times less the rows of the output matrix + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_a.set(Window::DimY, Window::Dimension(window.y().start() / 4, std::max(window.y().end() / 4, 1), 1)); + + Window win_b; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the the matrix multiplication is used to perform a convolution operation + if(input1->info()->num_dimensions() >= 3) + { + win_b = window; + } + // Set step_x and step_y for matrix B. 
Scale by a factor of 16 the X range as the input transposed matrix A has 16 times less the cols of the output matrix + // The step along the x direction is 2 times the in_b_stride because for each iteration we compute 2 blocks of size 16x4 + win_b.set(Window::DimX, Window::Dimension(window.x().start() / 16, window.x().end() / 16, 2 * in_b_stride)); + win_b.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator ina(input0, win_a); + Iterator inb(input1, win_b); + Iterator out(output, window); + + // The implementation assumes that the matrix A and Matrix B have been reshaped respectively with NEGEMMInterleave4x4 and NEGEMMTranspose1xW + // The reshaping of the matrices helps to have a cache friendly implementation and helps to avoid the data re-arrangements needed for computing 16x4 elements per iteration + // All the values needed for computing a single 32x4 block will be read from consecutive memory positions + execute_window_loop(window, [&](const Coordinates & id) + { + auto mtx_a0 = reinterpret_cast(ina.ptr()); + auto mtx_b0 = reinterpret_cast(inb.ptr()); + auto mtx_b1 = mtx_b0 + in_b_stride; + + qint16x8_t acc00_qs16 = vdupq_n_qs16(0); + qint16x8_t acc10_qs16 = vdupq_n_qs16(0); + qint16x8_t acc20_qs16 = vdupq_n_qs16(0); + qint16x8_t acc30_qs16 = vdupq_n_qs16(0); + + qint16x8_t acc01_qs16 = vdupq_n_qs16(0); + qint16x8_t acc11_qs16 = vdupq_n_qs16(0); + qint16x8_t acc21_qs16 = vdupq_n_qs16(0); + qint16x8_t acc31_qs16 = vdupq_n_qs16(0); + + qint16x8_t acc02_qs16 = vdupq_n_qs16(0); + qint16x8_t acc12_qs16 = vdupq_n_qs16(0); + qint16x8_t acc22_qs16 = vdupq_n_qs16(0); + qint16x8_t acc32_qs16 = vdupq_n_qs16(0); + + qint16x8_t acc03_qs16 = vdupq_n_qs16(0); + qint16x8_t acc13_qs16 = vdupq_n_qs16(0); + qint16x8_t acc23_qs16 = vdupq_n_qs16(0); + qint16x8_t acc33_qs16 = vdupq_n_qs16(0); + + int k = 0; + // This for loop performs 2 accumulations + for(; k <= (num_elems_matrix_b_x - 32); k += 32) + { + const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0); + const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1); + const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2); + const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3); + const qint8x8_t a4 = vld1_dup_qs8(mtx_a0 + 4); + const qint8x8_t a5 = vld1_dup_qs8(mtx_a0 + 5); + const qint8x8_t a6 = vld1_dup_qs8(mtx_a0 + 6); + const qint8x8_t a7 = vld1_dup_qs8(mtx_a0 + 7); + + const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0); + const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8); + const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0); + const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8); + + // First accumulation + acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); + acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position); + acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position); + acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position); + acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position); + acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position); + acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position); + acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position); + + const qint8x8_t b02 = vld1_qs8(mtx_b0 + 16); + const qint8x8_t b03 = vld1_qs8(mtx_b0 + 24); + const qint8x8_t b12 = vld1_qs8(mtx_b1 + 16); + const qint8x8_t b13 = vld1_qs8(mtx_b1 + 24); + + acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); + acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position); + acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position); + acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, 
fixed_point_position); + acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position); + acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position); + acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position); + acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position); + +#if __arm__ + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_a0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b0))); + asm volatile("PLD [%0, #128*2]" ::"r"(reinterpret_cast(mtx_b1))); +#endif + + // Second accumulation + acc00_qs16 = vqmlal_qs8(acc00_qs16, b02, a4, fixed_point_position); + acc10_qs16 = vqmlal_qs8(acc10_qs16, b02, a5, fixed_point_position); + acc20_qs16 = vqmlal_qs8(acc20_qs16, b02, a6, fixed_point_position); + acc30_qs16 = vqmlal_qs8(acc30_qs16, b02, a7, fixed_point_position); + acc01_qs16 = vqmlal_qs8(acc01_qs16, b03, a4, fixed_point_position); + acc11_qs16 = vqmlal_qs8(acc11_qs16, b03, a5, fixed_point_position); + acc21_qs16 = vqmlal_qs8(acc21_qs16, b03, a6, fixed_point_position); + acc31_qs16 = vqmlal_qs8(acc31_qs16, b03, a7, fixed_point_position); + acc02_qs16 = vqmlal_qs8(acc02_qs16, b12, a4, fixed_point_position); + acc12_qs16 = vqmlal_qs8(acc12_qs16, b12, a5, fixed_point_position); + acc22_qs16 = vqmlal_qs8(acc22_qs16, b12, a6, fixed_point_position); + acc32_qs16 = vqmlal_qs8(acc32_qs16, b12, a7, fixed_point_position); + acc03_qs16 = vqmlal_qs8(acc03_qs16, b13, a4, fixed_point_position); + acc13_qs16 = vqmlal_qs8(acc13_qs16, b13, a5, fixed_point_position); + acc23_qs16 = vqmlal_qs8(acc23_qs16, b13, a6, fixed_point_position); + acc33_qs16 = vqmlal_qs8(acc33_qs16, b13, a7, fixed_point_position); + + mtx_a0 += 8; + mtx_b0 += 32; + mtx_b1 += 32; + } + + // This for loop performs the left over accumulations + for(; k < num_elems_matrix_b_x; k += 16) + { + const qint8x8_t a0 = vld1_dup_qs8(mtx_a0 + 0); + const qint8x8_t a1 = vld1_dup_qs8(mtx_a0 + 1); + const qint8x8_t a2 = vld1_dup_qs8(mtx_a0 + 2); + const qint8x8_t a3 = vld1_dup_qs8(mtx_a0 + 3); + + const qint8x8_t b00 = vld1_qs8(mtx_b0 + 0); + const qint8x8_t b01 = vld1_qs8(mtx_b0 + 8); + const qint8x8_t b10 = vld1_qs8(mtx_b1 + 0); + const qint8x8_t b11 = vld1_qs8(mtx_b1 + 8); + + acc00_qs16 = vqmlal_qs8(acc00_qs16, b00, a0, fixed_point_position); + acc10_qs16 = vqmlal_qs8(acc10_qs16, b00, a1, fixed_point_position); + acc20_qs16 = vqmlal_qs8(acc20_qs16, b00, a2, fixed_point_position); + acc30_qs16 = vqmlal_qs8(acc30_qs16, b00, a3, fixed_point_position); + acc01_qs16 = vqmlal_qs8(acc01_qs16, b01, a0, fixed_point_position); + acc11_qs16 = vqmlal_qs8(acc11_qs16, b01, a1, fixed_point_position); + acc21_qs16 = vqmlal_qs8(acc21_qs16, b01, a2, fixed_point_position); + acc31_qs16 = vqmlal_qs8(acc31_qs16, b01, a3, fixed_point_position); + acc02_qs16 = vqmlal_qs8(acc02_qs16, b10, a0, fixed_point_position); + acc12_qs16 = vqmlal_qs8(acc12_qs16, b10, a1, fixed_point_position); + acc22_qs16 = vqmlal_qs8(acc22_qs16, b10, a2, fixed_point_position); + acc32_qs16 = vqmlal_qs8(acc32_qs16, b10, a3, fixed_point_position); + acc03_qs16 = vqmlal_qs8(acc03_qs16, b11, a0, fixed_point_position); + acc13_qs16 = vqmlal_qs8(acc13_qs16, b11, a1, fixed_point_position); + acc23_qs16 = vqmlal_qs8(acc23_qs16, b11, a2, fixed_point_position); + acc33_qs16 = vqmlal_qs8(acc33_qs16, b11, a3, fixed_point_position); + + mtx_a0 += 4; + mtx_b0 += 16; + mtx_b1 += 16; + } + + // Convert back to qint8x8_t and saturate + qint8x8_t acc00_qs8 = vqmovn_qs16(acc00_qs16); + qint8x8_t acc10_qs8 = vqmovn_qs16(acc10_qs16); + qint8x8_t 
acc20_qs8 = vqmovn_qs16(acc20_qs16); + qint8x8_t acc30_qs8 = vqmovn_qs16(acc30_qs16); + + qint8x8_t acc01_qs8 = vqmovn_qs16(acc01_qs16); + qint8x8_t acc11_qs8 = vqmovn_qs16(acc11_qs16); + qint8x8_t acc21_qs8 = vqmovn_qs16(acc21_qs16); + qint8x8_t acc31_qs8 = vqmovn_qs16(acc31_qs16); + + qint8x8_t acc02_qs8 = vqmovn_qs16(acc02_qs16); + qint8x8_t acc12_qs8 = vqmovn_qs16(acc12_qs16); + qint8x8_t acc22_qs8 = vqmovn_qs16(acc22_qs16); + qint8x8_t acc32_qs8 = vqmovn_qs16(acc32_qs16); + + qint8x8_t acc03_qs8 = vqmovn_qs16(acc03_qs16); + qint8x8_t acc13_qs8 = vqmovn_qs16(acc13_qs16); + qint8x8_t acc23_qs8 = vqmovn_qs16(acc23_qs16); + qint8x8_t acc33_qs8 = vqmovn_qs16(acc33_qs16); + + // Multiply by the weight of the matrix product (alpha) + if(multiply_alpha) + { + acc00_qs8 = vqmul_qs8(acc00_qs8, alpha_qs8, fixed_point_position); + acc10_qs8 = vqmul_qs8(acc10_qs8, alpha_qs8, fixed_point_position); + acc20_qs8 = vqmul_qs8(acc20_qs8, alpha_qs8, fixed_point_position); + acc30_qs8 = vqmul_qs8(acc30_qs8, alpha_qs8, fixed_point_position); + acc01_qs8 = vqmul_qs8(acc01_qs8, alpha_qs8, fixed_point_position); + acc11_qs8 = vqmul_qs8(acc11_qs8, alpha_qs8, fixed_point_position); + acc21_qs8 = vqmul_qs8(acc21_qs8, alpha_qs8, fixed_point_position); + acc31_qs8 = vqmul_qs8(acc31_qs8, alpha_qs8, fixed_point_position); + acc02_qs8 = vqmul_qs8(acc02_qs8, alpha_qs8, fixed_point_position); + acc12_qs8 = vqmul_qs8(acc12_qs8, alpha_qs8, fixed_point_position); + acc22_qs8 = vqmul_qs8(acc22_qs8, alpha_qs8, fixed_point_position); + acc32_qs8 = vqmul_qs8(acc32_qs8, alpha_qs8, fixed_point_position); + acc03_qs8 = vqmul_qs8(acc03_qs8, alpha_qs8, fixed_point_position); + acc13_qs8 = vqmul_qs8(acc13_qs8, alpha_qs8, fixed_point_position); + acc23_qs8 = vqmul_qs8(acc23_qs8, alpha_qs8, fixed_point_position); + acc33_qs8 = vqmul_qs8(acc33_qs8, alpha_qs8, fixed_point_position); + } + + const auto mtx_out0 = reinterpret_cast(out.ptr()); + + // Store 32x4 output elements + vst1_qs8(mtx_out0 + 0, acc00_qs8); + vst1_qs8(mtx_out0 + 8, acc01_qs8); + vst1_qs8(mtx_out0 + 16, acc02_qs8); + vst1_qs8(mtx_out0 + 24, acc03_qs8); + vst1_qs8(mtx_out0 + out_stride1 + 0, acc10_qs8); + vst1_qs8(mtx_out0 + out_stride1 + 8, acc11_qs8); + vst1_qs8(mtx_out0 + out_stride1 + 16, acc12_qs8); + vst1_qs8(mtx_out0 + out_stride1 + 24, acc13_qs8); + vst1_qs8(mtx_out0 + out_stride2 + 0, acc20_qs8); + vst1_qs8(mtx_out0 + out_stride2 + 8, acc21_qs8); + vst1_qs8(mtx_out0 + out_stride2 + 16, acc22_qs8); + vst1_qs8(mtx_out0 + out_stride2 + 24, acc23_qs8); + vst1_qs8(mtx_out0 + out_stride3 + 0, acc30_qs8); + vst1_qs8(mtx_out0 + out_stride3 + 8, acc31_qs8); + vst1_qs8(mtx_out0 + out_stride3 + 16, acc32_qs8); + vst1_qs8(mtx_out0 + out_stride3 + 24, acc33_qs8); + }, + ina, inb, out); +} + +} // namespace + +NEGEMMMatrixMultiplyKernel::NEGEMMMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr), _alpha(1.0f) +{ +} + +void NEGEMMMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F16, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F16, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + 
ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); + + if(output->info()->dimension(1) == 1) + { + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + } + + _input0 = input0; + _input1 = input1; + _output = output; + _alpha = alpha; + + unsigned int num_elems_processed_per_iteration_x = 0; + const unsigned int num_elems_processed_per_iteration_y = 4; + + // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication + if((output->info()->dimension(1) == 1)) + { + switch(input0->info()->data_type()) + { + case DataType::F32: + { + num_elems_processed_per_iteration_x = 16; + break; + } + case DataType::QS8: + { + num_elems_processed_per_iteration_x = 32; + break; + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x); + + update_window_and_padding(win, + AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x), + AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x), + output_access); + + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + + INEKernel::configure(win); + } + else + { + switch(input0->info()->data_type()) + { + case DataType::F32: + { + num_elems_processed_per_iteration_x = 8; + break; + } + case DataType::QS8: + { + num_elems_processed_per_iteration_x = 32; + break; + } + case DataType::F16: + { +#ifdef ARM_COMPUTE_ENABLE_FP16 + num_elems_processed_per_iteration_x = 8; + break; +#endif + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, + AccessWindowRectangle(input0->info(), 0, 0, 4, 1, 1.f, 0.25f), + AccessWindowTranspose(input1->info(), 0, 0, 4, 1, 0.f, 0.25f), + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); + } +} + +void NEGEMMMatrixMultiplyKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + bool multiply_alpha = std::abs(1.0f - _alpha) > 0.00001f; + + // Check if the output tensor is a vector and the data type is F32. If so,the kernel runs the vector-matrix multiplication + if((_output->info()->dimension(1) == 1)) + { + switch(_input0->info()->data_type()) + { + case DataType::F32: + { + multiply_alpha ? vector_matrix_multiply_f32(_input0, _input1, _output, window, _alpha) : + vector_matrix_multiply_f32(_input0, _input1, _output, window, _alpha); + break; + } + case DataType::QS8: + { + multiply_alpha ? 
vector_matrix_multiply_qs8(_input0, _input1, _output, window, _alpha) : + vector_matrix_multiply_qs8(_input0, _input1, _output, window, _alpha); + break; + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + } + else + { + switch(_input0->info()->data_type()) + { + case DataType::F32: + { + multiply_alpha ? matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha) : + matrix_matrix_multiply_f32(_input0, _input1, _output, window, _alpha); + break; + } + case DataType::QS8: + { + multiply_alpha ? matrix_matrix_multiply_qs8(_input0, _input1, _output, window, _alpha) : + matrix_matrix_multiply_qs8(_input0, _input1, _output, window, _alpha); + break; + } + case DataType::F16: + { +#ifdef ARM_COMPUTE_ENABLE_FP16 + multiply_alpha ? matrix_matrix_multiply_f16(_input0, _input1, _output, window, _alpha) : + matrix_matrix_multiply_f16(_input0, _input1, _output, window, _alpha); + break; +#endif + } + default: + { + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + } +} diff --git a/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp new file mode 100644 index 0000000000..ccf5cb4de3 --- /dev/null +++ b/src/core/NEON/kernels/NEGEMMTranspose1xWKernel.cpp @@ -0,0 +1,150 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +void NEGEMMTranspose1xWKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::U8, DataType::S8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_NULLPTR(output); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t transpose_w = 16 / input->info()->element_size(); + output_shape.set(0, input->info()->dimension(1) * transpose_w); + output_shape.set(1, static_cast(std::ceil((input->info()->dimension(0) / static_cast(transpose_w))))); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + const unsigned int num_elems_processed_per_iteration = 16 / input->info()->element_size(); + const float scale_x = num_elems_processed_per_iteration; + + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, 1, scale_x, 1.f / scale_x); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEGEMMTranspose1xWKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + /* + * Following an example of how the transposition1xW works when the input data type is F32 + * + * |a00 a01 a02 a03| + * |a10 a11 a12 a13| + * |a20 a21 a22 a23| = | a00 a01 a02 a03 || a10 a11 a12 a13 || a20 a21 a22 a23 || a30 a31 a32 a33 | + * |a30 a31 a32 a33| + * + * The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + */ + + // Set window for output tensor. 
Set to 0 the X and Y dimensions in order to allow multi-threading implementation and future batched matrix multiplications + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, window); + Iterator out(_output, win_out); + + switch(_input->info()->element_size()) + { + case 1: + { + const size_t out_stride = _output->info()->strides_in_bytes()[1]; + execute_window_loop(window, [&](const Coordinates & id) + { + // Output address = base addr + (y * 16) + (x / 16 ) * stride + const uint8_t *in_ptr = in.ptr(); + uint8_t *const out_ptr = out.ptr() + (id.y() << 4) + (id.x() >> 4) * out_stride; + vst1q_u8(out_ptr, vld1q_u8(in_ptr)); + }, + in, out); + break; + } + case 2: + { + const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(int16_t); + execute_window_loop(window, [&](const Coordinates & id) + { + // Output address = base addr + (y * 8) + (x / 8 ) * stride + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()) + (id.y() << 3) + (id.x() >> 3) * out_stride; + vst1q_u16(out_ptr, vld1q_u16(in_ptr)); + }, + in, out); + break; + } + case 4: + { + const size_t out_stride = _output->info()->strides_in_bytes()[1] / sizeof(float); + execute_window_loop(window, [&](const Coordinates & id) + { + // Output address = base addr + (y * 4) + (x / 4 ) * stride + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()) + (id.y() << 2) + (id.x() >> 2) * out_stride; + vst1q_u32(out_ptr, vld1q_u32(in_ptr)); + }, + in, out); + break; + } + default: + { + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + } +} diff --git a/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp new file mode 100644 index 0000000000..419f4825ef --- /dev/null +++ b/src/core/NEON/kernels/NEGaussian3x3Kernel.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +BorderSize NEGaussian3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void NEGaussian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEGaussian3x3Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator output(_output, window); + + const uint8_t *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, -1)); + const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0)); + const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(-1, +1)); + + static const int16x8_t two = vdupq_n_s16(2); + static const int16x8_t four = vdupq_n_s16(4); + + execute_window_loop(window, [&](const Coordinates & id) + { + uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //top left + int16x8_t out = top_s16.val[0]; + //top mid + out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), two); + //top right + out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out = vmlaq_s16(out, mid_s16.val[0], two); + //mid mid + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 1), four); + //mid right + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out = vaddq_s16(out, bot_s16.val[0]); + //bot mid + out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + 
vst1_u8(output.ptr(), vqshrun_n_s16(out, 4)); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp new file mode 100644 index 0000000000..f872cc2f0a --- /dev/null +++ b/src/core/NEON/kernels/NEGaussian5x5Kernel.cpp @@ -0,0 +1,203 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +NEGaussian5x5HorKernel::NEGaussian5x5HorKernel() + : _border_size(0) +{ +} + +BorderSize NEGaussian5x5HorKernel::border_size() const +{ + return _border_size; +} + +void NEGaussian5x5HorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); + + _input = input; + _output = output; + _border_size = BorderSize(border_undefined ? 
0 : 2, 2); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEGaussian5x5HorKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_in(window); + win_in.shift(Window::DimX, -2); + + Iterator input(_input, win_in); + Iterator output(_output, window); + + static const int16x8_t six = vdupq_n_s16(6); + static const int16x8_t four = vdupq_n_s16(4); + + execute_window_loop(window, [&](const Coordinates & id) + { + uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out = vaddq_s16(data_s16.val[0], vextq_s16(data_s16.val[0], data_s16.val[1], 4)); + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four); + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six); + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four); + + vst1q_s16(reinterpret_cast(output.ptr()), out); + }, + input, output); +} + +BorderSize NEGaussian5x5VertKernel::border_size() const +{ + return BorderSize(2, 0); +} + +void NEGaussian5x5VertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::S16); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output, Format::U8); + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_read_per_iteration = 32; + constexpr unsigned int num_elems_written_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 5; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEGaussian5x5VertKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator output(_output, window); + + const uint8_t *input_top2_ptr = _input->ptr_to_element(Coordinates(0, -2)); + const uint8_t *input_top_ptr = _input->ptr_to_element(Coordinates(0, -1)); + const uint8_t *input_mid_ptr = _input->ptr_to_element(Coordinates(0, 0)); + const uint8_t *input_low_ptr = 
_input->ptr_to_element(Coordinates(0, 1)); + const uint8_t *input_low2_ptr = _input->ptr_to_element(Coordinates(0, 2)); + + const uint16x8_t six = vdupq_n_u16(6); + const uint16x8_t four = vdupq_n_u16(4); + + execute_window_loop(window, [&](const Coordinates & id) + { + const size_t input_offset_high_s16 = input.offset(); + const size_t input_offset_low_s16 = input.offset() + 16; + + //HIGH DATA + //top2 + uint16x8_t data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top2_ptr + input_offset_high_s16))); + uint16x8_t out_high = data_high; + //top + data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top_ptr + input_offset_high_s16))); + out_high = vmlaq_u16(out_high, data_high, four); + //mid + data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_mid_ptr + input_offset_high_s16))); + out_high = vmlaq_u16(out_high, data_high, six); + //low + data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low_ptr + input_offset_high_s16))); + out_high = vmlaq_u16(out_high, data_high, four); + //low2 + data_high = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low2_ptr + input_offset_high_s16))); + out_high = vaddq_u16(out_high, data_high); + + //LOW DATA + //top2 + uint16x8_t data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top2_ptr + input_offset_low_s16))); + uint16x8_t out_low = data_low; + //top + data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top_ptr + input_offset_low_s16))); + out_low = vmlaq_u16(out_low, data_low, four); + //mid + data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_mid_ptr + input_offset_low_s16))); + out_low = vmlaq_u16(out_low, data_low, six); + //low + data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low_ptr + input_offset_low_s16))); + out_low = vmlaq_u16(out_low, data_low, four); + //low2 + data_low = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low2_ptr + input_offset_low_s16))); + out_low = vaddq_u16(out_low, data_low); + + vst1q_u8(output.ptr(), vcombine_u8(vqshrn_n_u16(out_high, 8), + vqshrn_n_u16(out_low, 8))); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp new file mode 100644 index 0000000000..52d1fbf028 --- /dev/null +++ b/src/core/NEON/kernels/NEGaussianPyramidKernel.cpp @@ -0,0 +1,279 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +NEGaussianPyramidHorKernel::NEGaussianPyramidHorKernel() + : _border_size(0), _l2_load_offset(0) +{ +} + +BorderSize NEGaussianPyramidHorKernel::border_size() const +{ + return _border_size; +} + +void NEGaussianPyramidHorKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != 2 * output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != output->info()->dimension(1)); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + _border_size = BorderSize(border_undefined ? 0 : 2, 2); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_read_per_iteration = 32; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr float scale_x = 0.5f; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration, scale_x); + + // Sub sampling selects odd pixels (1, 3, 5, ...) for images with even + // width and even pixels (0, 2, 4, ...) for images with odd width. (Whether + // a pixel is even or odd is determined based on the tensor shape not the + // valid region!) + // Thus the offset from which the first pixel (L2) for the convolution is + // loaded depends on the anchor and shape of the valid region. + // In the case of an even shape (= even image width) we need to load L2 + // from -2 if the anchor is odd and from -1 if the anchor is even. That + // makes sure that L2 is always loaded from an odd pixel. + // On the other hand, for an odd shape (= odd image width) we need to load + // L2 from -1 if the anchor is odd and from -2 if the anchor is even to + // achieve the opposite effect. + // The condition can be simplified to checking whether anchor + shape is + // odd (-2) or even (-1) as only adding an odd and an even number will have + // an odd result. 
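+ // Illustrative example (hypothetical sizes, assuming the window starts at column 0):
+ // border_size().left is 2, so the offset starts at -2. For a valid region anchored at
+ // 0 with an even width of 640, anchor + shape = 640 is even and the offset becomes -1;
+ // the centre tap of the 5-tap filter then falls on the odd columns 1, 3, 5, ...
+ // For an odd width of 641 the sum is odd, the offset stays at -2 and the centre tap
+ // falls on the even columns 0, 2, 4, ..., matching the comment above.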
+ _l2_load_offset = -border_size().left; + + if((_input->info()->valid_region().anchor[0] + _input->info()->valid_region().shape[0]) % 2 == 0) + { + _l2_load_offset += 1; + } + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), _l2_load_offset, num_elems_read_per_iteration), + output_access); + + ValidRegion valid_region = input->info()->valid_region(); + valid_region.anchor.set(0, std::ceil((valid_region.anchor[0] + (border_undefined ? border_size().left : 0)) / 2.f)); + valid_region.shape.set(0, (valid_region.shape[0] - (border_undefined ? border_size().right : 0)) / 2 - valid_region.anchor[0]); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEGaussianPyramidHorKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(window.x().step() % 2); + + static const int16x8_t six = vdupq_n_s16(6); + static const int16x8_t four = vdupq_n_s16(4); + + Window win_in(window); + win_in.shift(Window::DimX, _l2_load_offset); + + Iterator in(_input, win_in); + + // The output is half the width of the input + Window win_out(window); + win_out.scale(Window::DimX, 0.5f); + + Iterator out(_output, win_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16x2_t data_2q = vld2q_u8(in.ptr()); + const uint8x16_t &data_even = data_2q.val[0]; + const uint8x16_t &data_odd = data_2q.val[1]; + + const int16x8_t data_l2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_even))); + const int16x8_t data_l1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data_odd))); + const int16x8_t data_m = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 1)))); + const int16x8_t data_r1 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_odd, data_odd, 1)))); + const int16x8_t data_r2 = vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(vextq_u8(data_even, data_even, 2)))); + + int16x8_t out_val = vaddq_s16(data_l2, data_r2); + out_val = vmlaq_s16(out_val, data_l1, four); + out_val = vmlaq_s16(out_val, data_m, six); + out_val = vmlaq_s16(out_val, data_r1, four); + + vst1q_s16(reinterpret_cast(out.ptr()), out_val); + }, + in, out); +} + +NEGaussianPyramidVertKernel::NEGaussianPyramidVertKernel() + : _t2_load_offset(0) +{ +} + +BorderSize NEGaussianPyramidVertKernel::border_size() const +{ + return BorderSize(2, 0); +} + +void NEGaussianPyramidVertKernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != output->info()->dimension(0)); + ARM_COMPUTE_ERROR_ON(input->info()->dimension(1) != 2 * output->info()->dimension(1)); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_rows_processed_per_iteration = 2; + + constexpr unsigned int num_elems_written_per_iteration = 16; + constexpr unsigned int num_rows_written_per_iteration = 1; + + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 5; + + constexpr float scale_y = 0.5f; + + Window win = 
calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_rows_processed_per_iteration), border_undefined, border_size()); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration, 1.f, scale_y); + + // Determine whether we need to load even or odd rows. See above for a + // detailed explanation. + _t2_load_offset = -border_size().top; + + if((_input->info()->valid_region().anchor[1] + _input->info()->valid_region().shape[1]) % 2 == 0) + { + _t2_load_offset += 1; + } + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, _t2_load_offset, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + ValidRegion valid_region = input->info()->valid_region(); + valid_region.anchor.set(1, std::ceil((valid_region.anchor[1] + (border_undefined ? border_size().top : 0)) / 2.f)); + valid_region.shape.set(1, (valid_region.shape[1] - (border_undefined ? border_size().bottom : 0)) / 2 - valid_region.anchor[1]); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEGaussianPyramidVertKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(window.x().step() != 16); + ARM_COMPUTE_ERROR_ON(window.y().step() % 2); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + static const uint16x8_t six = vdupq_n_u16(6); + static const uint16x8_t four = vdupq_n_u16(4); + + Window win_in(window); + // Need to load two times 8 values instead of 16 values once + win_in.set_dimension_step(Window::DimX, 8); + win_in.shift(Window::DimY, _t2_load_offset); + + Iterator in(_input, win_in); + + // Output's height is half of input's + Window win_out(window); + win_out.scale(Window::DimY, 0.5f); + + Iterator out(_output, win_out); + + const uint8_t *input_top2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 0)); + const uint8_t *input_top_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 1)); + const uint8_t *input_mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 2)); + const uint8_t *input_low_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 3)); + const uint8_t *input_low2_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, 4)); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Low data + const uint16x8_t data_low_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top2_ptr + in.offset()))); + const uint16x8_t data_low_t1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top_ptr + in.offset()))); + const uint16x8_t data_low_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_mid_ptr + in.offset()))); + const uint16x8_t data_low_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low_ptr + in.offset()))); + const uint16x8_t data_low_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low2_ptr + in.offset()))); + + uint16x8_t out_low = vaddq_u16(data_low_t2, data_low_b2); + out_low = vmlaq_u16(out_low, data_low_t1, four); + out_low = vmlaq_u16(out_low, data_low_m, six); + out_low = vmlaq_u16(out_low, data_low_b1, four); + + in.increment(Window::DimX); + + // High data + const uint16x8_t data_high_t2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top2_ptr + in.offset()))); + const uint16x8_t data_high_t1 = 
vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_top_ptr + in.offset()))); + const uint16x8_t data_high_m = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_mid_ptr + in.offset()))); + const uint16x8_t data_high_b1 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low_ptr + in.offset()))); + const uint16x8_t data_high_b2 = vreinterpretq_u16_s16(vld1q_s16(reinterpret_cast(input_low2_ptr + in.offset()))); + + uint16x8_t out_high = vaddq_u16(data_high_t2, data_high_b2); + out_high = vmlaq_u16(out_high, data_high_t1, four); + out_high = vmlaq_u16(out_high, data_high_m, six); + out_high = vmlaq_u16(out_high, data_high_b1, four); + + vst1q_u8(out.ptr(), vcombine_u8(vqshrn_n_u16(out_low, 8), vqshrn_n_u16(out_high, 8))); + }, + in, out); +} diff --git a/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp new file mode 100644 index 0000000000..404ad8a388 --- /dev/null +++ b/src/core/NEON/kernels/NEHOGDescriptorKernel.cpp @@ -0,0 +1,802 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/HOGInfo.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +void cell_width_lt8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, + size_t mag_stride, size_t phase_stride, size_t cell_width, size_t cell_height, size_t num_bins, float phase_scale) +{ + const float32x4_t scale_f32 = vdupq_n_f32(phase_scale); + static const float32x4_t one_f32 = vdupq_n_f32(1.0f); + static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f); + static const int32x4_t zero_s32 = vdupq_n_s32(0); + static const int32x4_t one_s32 = vdupq_n_s32(1); + const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins); + + memset(output_ptr, 0, sizeof(float) * num_bins); + + for(size_t yc = 0; yc < cell_height; ++yc) + { + int32_t xc = 0; + + for(; xc <= static_cast(cell_width) - 4; xc += 4) + { + // Load magnitude and phase values + const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride); + const int16x4_t mag_s16 = vld1_s16(mag_row_ptr + xc + yc * mag_stride); + + // Convert magnitude and phase to float + const float32x4_t mag_f32 = vcvtq_f32_s32(vmovl_s16(mag_s16)); + float32x4_t phase_f32 = vcvtq_f32_u32(vmovl_u16(vget_low_u16(vmovl_u8(phase_u8)))); + + // Scale phase: phase * scale + 0.5f + phase_f32 = vmlaq_f32(zerofive_f32, phase_f32, scale_f32); + + // Compute histogram index. + int32x4_t hidx_s32 = vcvtq_s32_f32(phase_f32); + + // Compute magnitude weights (w0 and w1) + const float32x4_t hidx_f32 = vcvtq_f32_s32(hidx_s32); + + // w1 = phase_f32 - hidx_f32 + const float32x4_t w1_f32 = vsubq_f32(phase_f32, hidx_f32); + + // w0 = 1.0 - w1 + const float32x4_t w0_f32 = vsubq_f32(one_f32, w1_f32); + + // Compute contribute for splitting vote + const float32x4_t mag_w0_f32 = vmulq_f32(mag_f32, w0_f32); + const float32x4_t mag_w1_f32 = vmulq_f32(mag_f32, w1_f32); + + // Weighted vote between 2 bins + + // Check if the histogram index is equal to num_bins. 
If so, replace the index with 0 + uint32x4_t mask = vceqq_s32(hidx_s32, num_bins_s32); + hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32); + + // Bin 0 + *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w0_f32, 0); + *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w0_f32, 1); + *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w0_f32, 2); + *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w0_f32, 3); + + hidx_s32 = vaddq_s32(hidx_s32, one_s32); + + // Check if the histogram index is equal to num_bins + mask = vceqq_s32(hidx_s32, num_bins_s32); + hidx_s32 = vbslq_s32(mask, zero_s32, hidx_s32); + + // Bin1 + *(output_ptr + vgetq_lane_s32(hidx_s32, 0)) += vgetq_lane_f32(mag_w1_f32, 0); + *(output_ptr + vgetq_lane_s32(hidx_s32, 1)) += vgetq_lane_f32(mag_w1_f32, 1); + *(output_ptr + vgetq_lane_s32(hidx_s32, 2)) += vgetq_lane_f32(mag_w1_f32, 2); + *(output_ptr + vgetq_lane_s32(hidx_s32, 3)) += vgetq_lane_f32(mag_w1_f32, 3); + } + + for(; xc < static_cast(cell_width); ++xc) + { + const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f; + const float mag_value = *(mag_row_ptr + xc + yc * mag_stride); + + const float w1 = phase_value - std::floor(phase_value); + + // The quantised phase is the histogram index [0, num_bins - 1] - Round + // Check limit of histogram index. If hidx == num_bins, hidx = 0 + const auto hidx = static_cast(phase_value) % num_bins; + + // Weighted vote between 2 bins + *(output_ptr + hidx) += mag_value * (1.0f - w1); + *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1; + } + } +} + +void cell_width_ge8(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width, + size_t cell_height, size_t num_bins, float phase_scale) +{ + const float32x4_t scale_f32 = vdupq_n_f32(phase_scale); + static const float32x4_t one_f32 = vdupq_n_f32(1.0f); + static const float32x4_t zerofive_f32 = vdupq_n_f32(0.5f); + static const int32x4_t zero_s32 = vdupq_n_s32(0); + static const int32x4_t one_s32 = vdupq_n_s32(1); + const int32x4_t num_bins_s32 = vdupq_n_s32(num_bins); + + memset(output_ptr, 0, sizeof(float) * num_bins); + + for(size_t yc = 0; yc < cell_height; ++yc) + { + int32_t xc = 0; + + for(; xc <= static_cast(cell_width) - 8; xc += 8) + { + // Load magnitude and phase values + const uint8x8_t phase_u8 = vld1_u8(phase_row_ptr + xc + yc * phase_stride); + const int16x8_t mag_s16 = vld1q_s16(mag_row_ptr + xc + yc * mag_stride); + + // Convert phase to U16 + const uint16x8_t phase_u16 = vmovl_u8(phase_u8); + + // Convert magnitude to float32 + const float32x4x2_t mag_f32 = + { + { + vcvtq_f32_s32(vmovl_s16(vget_low_s16(mag_s16))), + vcvtq_f32_s32(vmovl_s16(vget_high_s16(mag_s16))) + } + }; + + // Convert phase to float32 + float32x4x2_t phase_f32 = + { + { + vcvtq_f32_u32(vmovl_u16(vget_low_u16(phase_u16))), + vcvtq_f32_u32(vmovl_u16(vget_high_u16(phase_u16))) + } + }; + + // Scale phase: phase * scale + 0.5f + phase_f32.val[0] = vmlaq_f32(zerofive_f32, phase_f32.val[0], scale_f32); + phase_f32.val[1] = vmlaq_f32(zerofive_f32, phase_f32.val[1], scale_f32); + + // Compute histogram index. 
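+ // Worked example (hypothetical values): a scaled phase of 5.3 truncates below to
+ // hidx = 5, so w1 = 5.3 - 5 = 0.3 and w0 = 1 - 0.3 = 0.7; 0.7 * mag is voted into
+ // bin 5 and 0.3 * mag into bin 6, with the vceqq/vbslq pair wrapping any index
+ // that reaches num_bins back to bin 0.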
+ int32x4x2_t hidx_s32 = + { + { + vcvtq_s32_f32(phase_f32.val[0]), + vcvtq_s32_f32(phase_f32.val[1]) + } + }; + + // Compute magnitude weights (w0 and w1) + const float32x4x2_t hidx_f32 = + { + { + vcvtq_f32_s32(hidx_s32.val[0]), + vcvtq_f32_s32(hidx_s32.val[1]) + } + }; + + float32x4x2_t w1_f32 = + { + { + vsubq_f32(phase_f32.val[0], hidx_f32.val[0]), + vsubq_f32(phase_f32.val[1], hidx_f32.val[1]) + } + }; + + float32x4x2_t w0_f32 = + { + { + vsubq_f32(one_f32, w1_f32.val[0]), + vsubq_f32(one_f32, w1_f32.val[1]) + } + }; + + // Compute contribute for splitting vote + const float32x4x2_t mag_w0_f32 = + { + { + vmulq_f32(mag_f32.val[0], w0_f32.val[0]), + vmulq_f32(mag_f32.val[1], w0_f32.val[1]) + } + }; + + const float32x4x2_t mag_w1_f32 = + { + { + vmulq_f32(mag_f32.val[0], w1_f32.val[0]), + vmulq_f32(mag_f32.val[1], w1_f32.val[1]) + } + }; + + // Weighted vote between 2 bins + + // Check if the histogram index is equal to num_bins + uint32x4x2_t mask = + { + { + vceqq_s32(hidx_s32.val[0], num_bins_s32), + vceqq_s32(hidx_s32.val[1], num_bins_s32) + } + }; + + hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]); + hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]); + + // First bin - Low + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w0_f32.val[0], 0); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w0_f32.val[0], 1); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w0_f32.val[0], 2); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w0_f32.val[0], 3); + + // First bin - high + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w0_f32.val[1], 0); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w0_f32.val[1], 1); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w0_f32.val[1], 2); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w0_f32.val[1], 3); + + hidx_s32.val[0] = vaddq_s32(hidx_s32.val[0], one_s32); + hidx_s32.val[1] = vaddq_s32(hidx_s32.val[1], one_s32); + + // Check if the histogram index is equal to num_bins + mask.val[0] = vceqq_s32(hidx_s32.val[0], num_bins_s32); + mask.val[1] = vceqq_s32(hidx_s32.val[1], num_bins_s32); + + hidx_s32.val[0] = vbslq_s32(mask.val[0], zero_s32, hidx_s32.val[0]); + hidx_s32.val[1] = vbslq_s32(mask.val[1], zero_s32, hidx_s32.val[1]); + + // Second bin - Low + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 0)) += vgetq_lane_f32(mag_w1_f32.val[0], 0); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 1)) += vgetq_lane_f32(mag_w1_f32.val[0], 1); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 2)) += vgetq_lane_f32(mag_w1_f32.val[0], 2); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[0], 3)) += vgetq_lane_f32(mag_w1_f32.val[0], 3); + + // Second bin - high + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 0)) += vgetq_lane_f32(mag_w1_f32.val[1], 0); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 1)) += vgetq_lane_f32(mag_w1_f32.val[1], 1); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 2)) += vgetq_lane_f32(mag_w1_f32.val[1], 2); + *(output_ptr + vgetq_lane_s32(hidx_s32.val[1], 3)) += vgetq_lane_f32(mag_w1_f32.val[1], 3); + } + + for(; xc < static_cast(cell_width); xc++) + { + const float phase_value = *(phase_row_ptr + xc + yc * phase_stride) * phase_scale + 0.5f; + const float mag_value = *(mag_row_ptr + xc + yc * mag_stride); + + const float w1 = phase_value - std::floor(phase_value); + + // The quantised phase is the 
histogram index [0, num_bins - 1] - Round + // Check limit of histogram index. If hidx == num_bins, hidx = 0 + const size_t hidx = static_cast(phase_value) % num_bins; + + // Weighted vote between 2 bins + *(output_ptr + hidx) += mag_value * (1.0f - w1); + *(output_ptr + ((hidx + 1) % (num_bins))) += mag_value * w1; + } + } +} + +void l2_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, + size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, float l2_hyst_threshold) +{ + ARM_COMPUTE_UNUSED(l2_hyst_threshold); + + float sum = 0.0f; + float32x4_t sum_f32 = vdupq_n_f32(0.0f); + + // Compute L2-Norm + for(size_t yc = 0; yc < num_cells_per_block_height; ++yc) + { + const float *const hist_ptr = input_row_ptr + yc * input_stride; + + int32_t xc = 0; + + for(; xc <= static_cast(num_bins_block_x) - 16; xc += 16) + { + const float32x4x4_t input_value = + { + { + vld1q_f32(hist_ptr + xc + 0), + vld1q_f32(hist_ptr + xc + 4), + vld1q_f32(hist_ptr + xc + 8), + vld1q_f32(hist_ptr + xc + 12) + } + }; + + // Compute input_value^2 + sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]); + + vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]); + vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]); + vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]); + vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]); + } + + // Compute left over + for(; xc < static_cast(num_bins_block_x); xc++) + { + const float input_value = hist_ptr[xc]; + + sum += input_value * input_value; + + output_ptr[xc + yc * num_bins_block_x] = input_value; + } + } + + sum += vgetq_lane_f32(sum_f32, 0); + sum += vgetq_lane_f32(sum_f32, 1); + sum += vgetq_lane_f32(sum_f32, 2); + sum += vgetq_lane_f32(sum_f32, 3); + + const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f); + const float32x4_t scale_f32 = vdupq_n_f32(scale); + + int32_t i = 0; + + for(; i <= static_cast(num_bins_block) - 16; i += 16) + { + float32x4x4_t input_value = + { + { + vld1q_f32(&output_ptr[i + 0]), + vld1q_f32(&output_ptr[i + 4]), + vld1q_f32(&output_ptr[i + 8]), + vld1q_f32(&output_ptr[i + 12]) + } + }; + + // Scale input_value + input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32); + input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32); + input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32); + input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32); + + vst1q_f32(&output_ptr[i + 0], input_value.val[0]); + vst1q_f32(&output_ptr[i + 4], input_value.val[1]); + vst1q_f32(&output_ptr[i + 8], input_value.val[2]); + vst1q_f32(&output_ptr[i + 12], input_value.val[3]); + } + + for(; i < static_cast(num_bins_block); ++i) + { + output_ptr[i] *= scale; + } +} + +void l2hys_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, + float l2_hyst_threshold) +{ + float sum = 0.0f; + float32x4_t sum_f32 = vdupq_n_f32(0.0f); + + // Compute L2-Hys + for(size_t yc = 0; yc < num_cells_per_block_height; ++yc) + { + const float *const hist_ptr = input_row_ptr + yc * input_stride; + + int32_t xc = 0; + + for(; xc <= 
static_cast(num_bins_block_x) - 16; xc += 16) + { + const float32x4x4_t input_value = + { + { + vld1q_f32(hist_ptr + xc + 0), + vld1q_f32(hist_ptr + xc + 4), + vld1q_f32(hist_ptr + xc + 8), + vld1q_f32(hist_ptr + xc + 12) + } + }; + + // Compute input_value^2 + sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]); + + vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]); + vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]); + vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]); + vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]); + } + + // Compute left over + for(; xc < static_cast(num_bins_block_x); ++xc) + { + const float input_value = hist_ptr[xc]; + + sum += input_value * input_value; + + output_ptr[xc + yc * num_bins_block_x] = input_value; + } + } + + sum += vgetq_lane_f32(sum_f32, 0); + sum += vgetq_lane_f32(sum_f32, 1); + sum += vgetq_lane_f32(sum_f32, 2); + sum += vgetq_lane_f32(sum_f32, 3); + + float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f); + float32x4_t scale_f32 = vdupq_n_f32(scale); + const float32x4_t l2_hyst_threshold_f32 = vdupq_n_f32(l2_hyst_threshold); + + // Reset sum + sum_f32 = vdupq_n_f32(0.0f); + sum = 0.0f; + + int32_t i = 0; + + for(; i <= static_cast(num_bins_block) - 16; i += 16) + { + float32x4x4_t input_value = + { + { + vld1q_f32(&output_ptr[i + 0]), + vld1q_f32(&output_ptr[i + 4]), + vld1q_f32(&output_ptr[i + 8]), + vld1q_f32(&output_ptr[i + 12]) + } + }; + + // Scale input_value + input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32); + input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32); + input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32); + input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32); + + // Clip input_value if over _threshold_l2hys + input_value.val[0] = vminq_f32(input_value.val[0], l2_hyst_threshold_f32); + input_value.val[1] = vminq_f32(input_value.val[1], l2_hyst_threshold_f32); + input_value.val[2] = vminq_f32(input_value.val[2], l2_hyst_threshold_f32); + input_value.val[3] = vminq_f32(input_value.val[3], l2_hyst_threshold_f32); + + // Compute input_value^2 + sum_f32 = vmlaq_f32(sum_f32, input_value.val[0], input_value.val[0]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[1], input_value.val[1]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[2], input_value.val[2]); + sum_f32 = vmlaq_f32(sum_f32, input_value.val[3], input_value.val[3]); + + vst1q_f32(&output_ptr[i + 0], input_value.val[0]); + vst1q_f32(&output_ptr[i + 4], input_value.val[1]); + vst1q_f32(&output_ptr[i + 8], input_value.val[2]); + vst1q_f32(&output_ptr[i + 12], input_value.val[3]); + } + + sum += vgetq_lane_f32(sum_f32, 0); + sum += vgetq_lane_f32(sum_f32, 1); + sum += vgetq_lane_f32(sum_f32, 2); + sum += vgetq_lane_f32(sum_f32, 3); + + for(; i < static_cast(num_bins_block); ++i) + { + float input_value = output_ptr[i] * scale; + + // Clip scaled input_value if over _threshold_L2hys + input_value = std::min(input_value, l2_hyst_threshold); + + sum += input_value * input_value; + + output_ptr[i] = input_value; + } + + // We use the same constants of OpenCV + scale = 1.0f / (std::sqrt(sum) + 1e-3f); + scale_f32 = vdupq_n_f32(scale); + + // Rescale + i = 0; + + for(; i <= static_cast(num_bins_block) 
- 16; i += 16) + { + float32x4x4_t input_value = + { + { + vld1q_f32(&output_ptr[i + 0]), + vld1q_f32(&output_ptr[i + 4]), + vld1q_f32(&output_ptr[i + 8]), + vld1q_f32(&output_ptr[i + 12]) + } + }; + + // Scale input_value + input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32); + input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32); + input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32); + input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32); + + vst1q_f32(&output_ptr[i + 0], input_value.val[0]); + vst1q_f32(&output_ptr[i + 4], input_value.val[1]); + vst1q_f32(&output_ptr[i + 8], input_value.val[2]); + vst1q_f32(&output_ptr[i + 12], input_value.val[3]); + } + + for(; i < static_cast(num_bins_block); ++i) + { + // Store result + output_ptr[i] *= scale; + } +} + +void l1_norm(const float *__restrict input_row_ptr, float *__restrict output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, + float l2_hyst_threshold) +{ + ARM_COMPUTE_UNUSED(l2_hyst_threshold); + + float sum = 0.0f; + float32x4_t sum_f32 = vdupq_n_f32(0.0f); + + // Compute L1-Norm + for(size_t yc = 0; yc < num_cells_per_block_height; ++yc) + { + const float *const hist_ptr = input_row_ptr + yc * input_stride; + + int32_t xc = 0; + + for(; xc <= static_cast(num_bins_block_x) - 16; xc += 16) + { + const float32x4x4_t input_value = + { + { + vld1q_f32(hist_ptr + xc + 0), + vld1q_f32(hist_ptr + xc + 4), + vld1q_f32(hist_ptr + xc + 8), + vld1q_f32(hist_ptr + xc + 12) + } + }; + + // Compute |input_value| + sum_f32 += vabsq_f32(input_value.val[0]); + sum_f32 += vabsq_f32(input_value.val[1]); + sum_f32 += vabsq_f32(input_value.val[2]); + sum_f32 += vabsq_f32(input_value.val[3]); + + vst1q_f32(&output_ptr[xc + 0 + yc * num_bins_block_x], input_value.val[0]); + vst1q_f32(&output_ptr[xc + 4 + yc * num_bins_block_x], input_value.val[1]); + vst1q_f32(&output_ptr[xc + 8 + yc * num_bins_block_x], input_value.val[2]); + vst1q_f32(&output_ptr[xc + 12 + yc * num_bins_block_x], input_value.val[3]); + } + + for(; xc < static_cast(num_bins_block_x); xc++) + { + const float input_value = hist_ptr[xc]; + + sum += std::abs(input_value); + + output_ptr[xc + yc * num_bins_block_x] = input_value; + } + } + + sum += vgetq_lane_f32(sum_f32, 0); + sum += vgetq_lane_f32(sum_f32, 1); + sum += vgetq_lane_f32(sum_f32, 2); + sum += vgetq_lane_f32(sum_f32, 3); + + const float scale = 1.0f / (std::sqrt(sum) + num_bins_block * 0.1f); + const float32x4_t scale_f32 = vdupq_n_f32(scale); + + int32_t i = 0; + + for(; i <= static_cast(num_bins_block) - 16; i += 16) + { + float32x4x4_t input_value = + { + { + vld1q_f32(&output_ptr[i + 0]), + vld1q_f32(&output_ptr[i + 4]), + vld1q_f32(&output_ptr[i + 8]), + vld1q_f32(&output_ptr[i + 12]) + } + }; + + // Scale input_value + input_value.val[0] = vmulq_f32(input_value.val[0], scale_f32); + input_value.val[1] = vmulq_f32(input_value.val[1], scale_f32); + input_value.val[2] = vmulq_f32(input_value.val[2], scale_f32); + input_value.val[3] = vmulq_f32(input_value.val[3], scale_f32); + + vst1q_f32(&output_ptr[i + 0], input_value.val[0]); + vst1q_f32(&output_ptr[i + 4], input_value.val[1]); + vst1q_f32(&output_ptr[i + 8], input_value.val[2]); + vst1q_f32(&output_ptr[i + 12], input_value.val[3]); + } + + for(; i < static_cast(num_bins_block); ++i) + { + output_ptr[i] *= scale; + } +} +} // namespace + +NEHOGOrientationBinningKernel::NEHOGOrientationBinningKernel() + : _func(nullptr), _input_magnitude(nullptr), _input_phase(nullptr), 
_output(nullptr), _cell_width(0), _cell_height(0), _num_bins(0), _phase_scale(0) +{ +} + +void NEHOGOrientationBinningKernel::configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_magnitude, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_phase, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(hog_info == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, hog_info->num_bins(), DataType::F32); + ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimX) != input_phase->info()->dimension(Window::DimX)); + ARM_COMPUTE_ERROR_ON(input_magnitude->info()->dimension(Window::DimY) != input_phase->info()->dimension(Window::DimY)); + + _input_magnitude = input_magnitude; + _input_phase = input_phase; + _output = output; + _cell_width = hog_info->cell_size().width; + _cell_height = hog_info->cell_size().height; + _num_bins = hog_info->num_bins(); + _phase_scale = (PhaseType::SIGNED == hog_info->phase_type() ? _num_bins / 360.0f : _num_bins / 180.0f); + _phase_scale *= (PhaseType::SIGNED == hog_info->phase_type() ? 360.0f / 255.0f : 1.0f); + + if(_cell_width < 8) + { + _func = &cell_width_lt8; + } + else + { + _func = &cell_width_ge8; + } + + constexpr unsigned int num_elems_processed_per_iteration = 1; + const unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = _cell_height; + const unsigned int num_elems_written_per_iteration = 1; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input_magnitude->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), + AccessWindowRectangle(input_phase->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEHOGOrientationBinningKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + const size_t mag_stride = _input_magnitude->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_magnitude->info()->format()); + const size_t phase_stride = _input_phase->info()->strides_in_bytes()[Window::DimY] / pixel_size_from_format(_input_phase->info()->format()); + + Window win_mag(window); + win_mag.set(Window::DimX, Window::Dimension(window.x().start() * _cell_width, window.x().start() * _cell_width, _cell_width)); + win_mag.set(Window::DimY, Window::Dimension(window.y().start() * _cell_height, window.y().start() * _cell_height, _cell_height)); + + Window win_phase(win_mag); + + Iterator mag(_input_magnitude, win_mag); + Iterator phase(_input_phase, win_phase); + Iterator out(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto mag_row_ptr = reinterpret_cast(mag.ptr()); + const auto phase_row_ptr = reinterpret_cast(phase.ptr()); + const auto out_row_ptr = reinterpret_cast(out.ptr()); + + (*_func)(mag_row_ptr, phase_row_ptr, out_row_ptr, mag_stride, phase_stride, _cell_width, _cell_height, _num_bins, _phase_scale); + }, + mag, phase, out); +} + 
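+// Illustrative arithmetic for the _phase_scale set in configure() above, assuming
+// num_bins = 9: for SIGNED phase the two statements combine to (9 / 360) * (360 / 255)
+// = 9 / 255, so the largest U8 phase value, 255, scales to exactly 9.0 and is folded
+// back onto bin 0 by the wrap-around check in the binning functions; for UNSIGNED
+// phase the scale is simply 9 / 180.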
+NEHOGBlockNormalizationKernel::NEHOGBlockNormalizationKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _num_cells_per_block(), _num_cells_per_block_stride(), _num_bins(0), _l2_hyst_threshold(0.0f) +{ +} + +void NEHOGBlockNormalizationKernel::configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info) +{ + ARM_COMPUTE_ERROR_ON(hog_info == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, hog_info->num_bins(), DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(output, DataType::F32); + + // Number of cells per block + const Size2D num_cells_per_block(hog_info->block_size().width / hog_info->cell_size().width, + hog_info->block_size().height / hog_info->cell_size().height); + + // Number of cells per block stride + const Size2D num_cells_per_block_stride(hog_info->block_stride().width / hog_info->cell_size().width, + hog_info->block_stride().height / hog_info->cell_size().height); + + _input = input; + _output = output; + _l2_hyst_threshold = hog_info->l2_hyst_threshold(); + _num_cells_per_block = num_cells_per_block; + _num_cells_per_block_stride = num_cells_per_block_stride; + _num_bins = hog_info->num_bins(); + + ARM_COMPUTE_ERROR_ON((output->info()->num_channels() != (_num_bins * num_cells_per_block.width * num_cells_per_block.height))); + + switch(hog_info->normalization_type()) + { + case HOGNormType::L2_NORM: + _func = &l2_norm; + break; + case HOGNormType::L2HYS_NORM: + _func = &l2hys_norm; + break; + case HOGNormType::L1_NORM: + _func = &l1_norm; + break; + default: + ARM_COMPUTE_ERROR_ON("Normalisation type not supported"); + break; + } + + constexpr unsigned int num_elems_processed_per_iteration = 1; + const unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = _num_cells_per_block.height; + const unsigned int num_elems_written_per_iteration = 1; + const unsigned int num_rows_written_per_iteration = _num_cells_per_block.height; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_written_per_iteration, num_rows_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NEHOGBlockNormalizationKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Get number of bins per block + const size_t num_bins_per_block = _output->info()->num_channels(); + + // Number of bins on the same row of the block + const int32_t num_bins_per_block_x = _num_cells_per_block.width * _num_bins; + + const size_t input_stride = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type()); + + Window win_in(window); + win_in.set_dimension_step(Window::DimX, _num_cells_per_block_stride.width); + win_in.set_dimension_step(Window::DimY, _num_cells_per_block_stride.height); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + // Normalises blocks + execute_window_loop(window, [&](const Coordinates & id) + { + const auto input_row_ptr = reinterpret_cast(in.ptr()); + const auto out_row_ptr = reinterpret_cast(out.ptr()); + + // Execute normalization 
function + (*_func)(input_row_ptr, out_row_ptr, input_stride, _num_cells_per_block.height, num_bins_per_block_x, num_bins_per_block, _l2_hyst_threshold); + }, + in, out); +} diff --git a/src/core/NEON/kernels/NEHOGDetectorKernel.cpp b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp new file mode 100644 index 0000000000..4af22bca75 --- /dev/null +++ b/src/core/NEON/kernels/NEHOGDetectorKernel.cpp @@ -0,0 +1,186 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/HOGInfo.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +NEHOGDetectorKernel::NEHOGDetectorKernel() + : _input(nullptr), _detection_windows(), _hog_descriptor(nullptr), _bias(0.0f), _threshold(0.0f), _idx_class(0), _num_bins_per_descriptor_x(0), _num_blocks_per_descriptor_y(0), _block_stride_width(0), + _block_stride_height(0), _detection_window_width(0), _detection_window_height(0), _max_num_detection_windows(0), _mutex() +{ +} + +void NEHOGDetectorKernel::configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold, uint16_t idx_class) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_NOT_IN(input, DataType::F32); + ARM_COMPUTE_ERROR_ON(hog == nullptr); + ARM_COMPUTE_ERROR_ON(detection_windows == nullptr); + ARM_COMPUTE_ERROR_ON((detection_window_stride.width % hog->info()->block_stride().width) != 0); + ARM_COMPUTE_ERROR_ON((detection_window_stride.height % hog->info()->block_stride().height) != 0); + + const Size2D &detection_window_size = hog->info()->detection_window_size(); + const Size2D &block_size = hog->info()->block_size(); + const Size2D &block_stride = hog->info()->block_stride(); + + _input = input; + _detection_windows = detection_windows; + _threshold = threshold; + _idx_class = idx_class; + _hog_descriptor = hog->descriptor(); + _bias = _hog_descriptor[hog->info()->descriptor_size() - 1]; + _num_bins_per_descriptor_x = ((detection_window_size.width - block_size.width) / block_stride.width + 1) * input->info()->num_channels(); + _num_blocks_per_descriptor_y = (detection_window_size.height - block_size.height) / block_stride.height + 1; + _block_stride_width = block_stride.width; + 
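+ // Worked example with a hypothetical geometry (64x128 detection window, 16x16 blocks,
+ // 8x8 block stride, 36 channels per block from 9 bins x 2x2 cells):
+ // _num_bins_per_descriptor_x = ((64 - 16) / 8 + 1) * 36 = 252 and
+ // _num_blocks_per_descriptor_y = (128 - 16) / 8 + 1 = 15, i.e. 252 * 15 = 3780
+ // descriptor values plus the bias term checked against descriptor_size() a few lines below.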
_block_stride_height = block_stride.height; + _detection_window_width = detection_window_size.width; + _detection_window_height = detection_window_size.height; + _max_num_detection_windows = detection_windows->max_num_values(); + + ARM_COMPUTE_ERROR_ON((_num_bins_per_descriptor_x * _num_blocks_per_descriptor_y + 1) != hog->info()->descriptor_size()); + + // Get the number of blocks along the x and y directions of the input tensor + const ValidRegion &valid_region = input->info()->valid_region(); + const size_t num_blocks_x = valid_region.shape[0]; + const size_t num_blocks_y = valid_region.shape[1]; + + // Get the number of blocks along the x and y directions of the detection window + const size_t num_blocks_per_detection_window_x = detection_window_size.width / block_stride.width; + const size_t num_blocks_per_detection_window_y = detection_window_size.height / block_stride.height; + + const size_t window_step_x = detection_window_stride.width / block_stride.width; + const size_t window_step_y = detection_window_stride.height / block_stride.height; + + // Configure kernel window + Window win; + win.set(Window::DimX, Window::Dimension(0, floor_to_multiple(num_blocks_x - num_blocks_per_detection_window_x, window_step_x), window_step_x)); + win.set(Window::DimY, Window::Dimension(0, floor_to_multiple(num_blocks_y - num_blocks_per_detection_window_y, window_step_y), window_step_y)); + + constexpr unsigned int num_elems_read_per_iteration = 1; + const unsigned int num_rows_read_per_iteration = _num_blocks_per_descriptor_y; + + update_window_and_padding(win, AccessWindowRectangle(input->info(), 0, 0, num_elems_read_per_iteration, num_rows_read_per_iteration)); + + INEKernel::configure(win); +} + +void NEHOGDetectorKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_hog_descriptor == nullptr); + + const size_t in_step_y = _input->info()->strides_in_bytes()[Window::DimY] / data_size_from_type(_input->info()->data_type()); + + Iterator in(_input, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto *in_row_ptr = reinterpret_cast(in.ptr()); + + // Init score_f32 with 0 + float32x4_t score_f32 = vdupq_n_f32(0.0f); + + // Init score with bias + float score = _bias; + + // Compute Linear SVM + for(size_t yb = 0; yb < _num_blocks_per_descriptor_y; ++yb, in_row_ptr += in_step_y) + { + int32_t xb = 0; + + const int32_t offset_y = yb * _num_bins_per_descriptor_x; + + for(; xb < static_cast(_num_bins_per_descriptor_x) - 16; xb += 16) + { + // Load descriptor values + const float32x4x4_t a_f32 = + { + { + vld1q_f32(&in_row_ptr[xb + 0]), + vld1q_f32(&in_row_ptr[xb + 4]), + vld1q_f32(&in_row_ptr[xb + 8]), + vld1q_f32(&in_row_ptr[xb + 12]) + } + }; + + // Load detector values + const float32x4x4_t b_f32 = + { + { + vld1q_f32(&_hog_descriptor[xb + 0 + offset_y]), + vld1q_f32(&_hog_descriptor[xb + 4 + offset_y]), + vld1q_f32(&_hog_descriptor[xb + 8 + offset_y]), + vld1q_f32(&_hog_descriptor[xb + 12 + offset_y]) + } + }; + + // Multiply accumulate + score_f32 = vmlaq_f32(score_f32, a_f32.val[0], b_f32.val[0]); + score_f32 = vmlaq_f32(score_f32, a_f32.val[1], b_f32.val[1]); + score_f32 = vmlaq_f32(score_f32, a_f32.val[2], b_f32.val[2]); + score_f32 = vmlaq_f32(score_f32, a_f32.val[3], b_f32.val[3]); + } + + for(; xb < static_cast(_num_bins_per_descriptor_x); ++xb) + { + const float a = in_row_ptr[xb]; + const float b = _hog_descriptor[xb + offset_y]; + + 
score += a * b; + } + } + + score += vgetq_lane_f32(score_f32, 0); + score += vgetq_lane_f32(score_f32, 1); + score += vgetq_lane_f32(score_f32, 2); + score += vgetq_lane_f32(score_f32, 3); + + if(score > _threshold) + { + if(_detection_windows->num_values() < _max_num_detection_windows) + { + DetectionWindow win; + win.x = (id.x() * _block_stride_width); + win.y = (id.y() * _block_stride_height); + win.width = _detection_window_width; + win.height = _detection_window_height; + win.idx_class = _idx_class; + win.score = score; + + std::unique_lock lock(_mutex); + _detection_windows->push_back(win); + lock.unlock(); + } + } + }, + in); +} diff --git a/src/core/NEON/kernels/NEHarrisCornersKernel.cpp b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp new file mode 100644 index 0000000000..585676bb87 --- /dev/null +++ b/src/core/NEON/kernels/NEHarrisCornersKernel.cpp @@ -0,0 +1,1137 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +#ifdef ARM_COMPUTE_ENABLE_FP16 + +template class arm_compute::NEHarrisScoreFP16Kernel<3>; +template class arm_compute::NEHarrisScoreFP16Kernel<5>; +template class arm_compute::NEHarrisScoreFP16Kernel<7>; + +namespace fp16 +{ +inline float16x8_t harris_score(float16x8_t gx2, float16x8_t gy2, float16x8_t gxgy, float sensitivity, float strength_thresh) +{ + static const float16x8_t zero = vdupq_n_f16(0.f); + + // Trace^2 + float16x8_t trace2 = vaddq_f16(gx2, gy2); + trace2 = vmulq_f16(trace2, trace2); + + // Det(A) + float16x8_t det = vmulq_f16(gx2, gy2); + det = vfmsq_f16(det, gxgy, gxgy); + + // Det(A) - sensitivity * trace^2 + const float16x8_t mc = vfmsq_f16(det, vdupq_n_f16(sensitivity), trace2); + + // mc > strength_thresh + const uint16x8_t mask = vcgtq_f16(mc, vdupq_n_f16(strength_thresh)); + + return vbslq_f16(mask, mc, zero); +} + +template +inline void harris_score1xN_FLOAT_FLOAT_FLOAT(float16x8_t low_gx, float16x8_t low_gy, float16x8_t high_gx, float16x8_t high_gy, float16x8_t &gx2, float16x8_t &gy2, float16x8_t &gxgy, + float norm_factor) +{ + const float16x8_t norm_factor_fp16 = vdupq_n_f16(norm_factor); + + // Normalize + low_gx = vmulq_f16(low_gx, norm_factor_fp16); + low_gy = vmulq_f16(low_gy, norm_factor_fp16); + high_gx = vmulq_f16(high_gx, norm_factor_fp16); + high_gy = vmulq_f16(high_gy, norm_factor_fp16); + + float16x8_t gx = vextq_f16(low_gx, high_gx, 0); + float16x8_t gy = vextq_f16(low_gy, high_gy, 0); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + + gx = vextq_f16(low_gx, high_gx, 1); + gy = vextq_f16(low_gy, high_gy, 1); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + + gx = vextq_f16(low_gx, high_gx, 2); + gy = vextq_f16(low_gy, high_gy, 2); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + + if(block_size > 3) + { + gx = vextq_f16(low_gx, high_gx, 3); + gy = vextq_f16(low_gy, high_gy, 3); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + + gx = vextq_f16(low_gx, high_gx, 4); + gy = vextq_f16(low_gy, high_gy, 4); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + } + + if(block_size == 7) + { + gx = vextq_f16(low_gx, high_gx, 5); + gy = vextq_f16(low_gy, high_gy, 5); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + + gx = vextq_f16(low_gx, high_gx, 6); + gy = vextq_f16(low_gy, high_gy, 6); + + gx2 = vfmaq_f16(gx2, gx, gx); + gy2 = vfmaq_f16(gy2, gy, gy); + gxgy = vfmaq_f16(gxgy, gx, gy); + } +} + +template +inline void harris_score_S16_S16_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity, + float strength_thresh) +{ + auto gx_ptr_0 = static_cast(in1_ptr) - (block_size / 2) * (in_stride + 1); + auto gy_ptr_0 = static_cast(in2_ptr) - (block_size / 2) * (in_stride + 1); + const int16_t *gx_ptr_1 = 
gx_ptr_0 + 8; + const int16_t *gy_ptr_1 = gy_ptr_0 + 8; + const auto output = static_cast(out_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float16x8_t gx2 = vdupq_n_f16(0.0f); + float16x8_t gy2 = vdupq_n_f16(0.0f); + float16x8_t gxgy = vdupq_n_f16(0.0f); + + for(size_t i = 0; i < block_size; ++i) + { + const float16x8_t low_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_0)); + const float16x8_t high_gx = vcvtq_f16_s16(vld1q_s16(gx_ptr_1)); + const float16x8_t low_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_0)); + const float16x8_t high_gy = vcvtq_f16_s16(vld1q_s16(gy_ptr_1)); + harris_score1xN_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += in_stride; + gy_ptr_0 += in_stride; + gx_ptr_1 += in_stride; + gy_ptr_1 += in_stride; + } + + // Calculate harris score + const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh); + + // Store score + vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc))); + vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc))); +} + +template +inline void harris_score_S32_S32_FLOAT(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity, + float strength_thresh) +{ + static const float16x8_t zero = vdupq_n_f16(0.0f); + + auto gx_ptr_0 = static_cast(in1_ptr) - (block_size / 2) * (in_stride + 1); + auto gy_ptr_0 = static_cast(in2_ptr) - (block_size / 2) * (in_stride + 1); + const int32_t *gx_ptr_1 = gx_ptr_0 + 4; + const int32_t *gy_ptr_1 = gy_ptr_0 + 4; + const int32_t *gx_ptr_2 = gx_ptr_0 + 8; + const int32_t *gy_ptr_2 = gy_ptr_0 + 8; + const auto output = static_cast(out_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float16x8_t gx2 = zero; + float16x8_t gy2 = zero; + float16x8_t gxgy = zero; + + for(size_t i = 0; i < block_size; ++i) + { + const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))), + vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1)))); + const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))), + vget_low_f16(zero)); + const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))), + vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1)))); + const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))), + vget_low_f16(zero)); + harris_score1xN_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += in_stride; + gy_ptr_0 += in_stride; + gx_ptr_1 += in_stride; + gy_ptr_1 += in_stride; + gx_ptr_2 += in_stride; + gy_ptr_2 += in_stride; + } + + // Calculate harris score + const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh); + + // Store score + vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc))); + vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc))); +} + +template <> +inline void harris_score_S32_S32_FLOAT<7>(const void *__restrict in1_ptr, const void *__restrict in2_ptr, void *__restrict out_ptr, int32_t in_stride, float norm_factor, float sensitivity, + float strength_thresh) +{ + static const float16x8_t zero = vdupq_n_f16(0.0f); + + auto gx_ptr_0 = static_cast(in1_ptr) - 3 * (in_stride + 1); + auto gy_ptr_0 = static_cast(in2_ptr) - 3 * (in_stride + 1); + const int32_t *gx_ptr_1 = gx_ptr_0 + 4; + const int32_t *gy_ptr_1 = gy_ptr_0 + 4; + const int32_t *gx_ptr_2 = gx_ptr_0 + 8; + const int32_t *gy_ptr_2 = gy_ptr_0 + 8; + const int32_t *gx_ptr_3 = gx_ptr_0 + 12; + const int32_t *gy_ptr_3 
= gy_ptr_0 + 12; + const auto output = static_cast(out_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float16x8_t gx2 = zero; + float16x8_t gy2 = zero; + float16x8_t gxgy = zero; + + for(size_t i = 0; i < 7; ++i) + { + const float16x8_t low_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_0))), + vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_1)))); + const float16x8_t high_gx = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_2))), + vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gx_ptr_3)))); + const float16x8_t low_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_0))), + vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_1)))); + const float16x8_t high_gy = vcombine_f16(vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_2))), + vcvt_f16_f32(vcvtq_f32_s32(vld1q_s32(gy_ptr_3)))); + harris_score1xN_FLOAT_FLOAT_FLOAT<7>(low_gx, low_gy, high_gx, high_gy, gx2, gy2, gxgy, norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += in_stride; + gy_ptr_0 += in_stride; + gx_ptr_1 += in_stride; + gy_ptr_1 += in_stride; + gx_ptr_2 += in_stride; + gy_ptr_2 += in_stride; + } + + // Calculate harris score + const float16x8_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh); + + // Store score + vst1q_f32(output + 0, vcvt_f32_f16(vget_low_f16(mc))); + vst1q_f32(output + 4, vcvt_f32_f16(vget_high_f16(mc))); +} + +} // namespace fp16 + +template +BorderSize NEHarrisScoreFP16Kernel::border_size() const +{ + return _border_size; +} + +template +NEHarrisScoreFP16Kernel::NEHarrisScoreFP16Kernel() + : INEHarrisScoreKernel(), _func(nullptr) +{ +} + +template +void NEHarrisScoreFP16Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + Iterator input1(_input1, window); + Iterator input2(_input2, window); + Iterator output(_output, window); + + const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type()); + + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh); + }, + input1, input2, output); +} + +template +void NEHarrisScoreFP16Kernel::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON(0.0f == norm_factor); + + _input1 = input1; + _input2 = input2; + _output = output; + _sensitivity = sensitivity; + _strength_thresh = strength_thresh; + _norm_factor = norm_factor; + _border_size = BorderSize(block_size / 2); + + if(input1->info()->data_type() == DataType::S16) + { + _func = &fp16::harris_score_S16_S16_FLOAT; + } + else + { + _func = &fp16::harris_score_S32_S32_FLOAT; + } + + ARM_COMPUTE_ERROR_ON(nullptr == _func); + + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration 
= 8; + constexpr unsigned int num_rows_read_per_iteration = block_size; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration), + AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region, border_undefined, border_size()); + + INEKernel::configure(win); +} + +#endif + +template class arm_compute::NEHarrisScoreKernel<3>; +template class arm_compute::NEHarrisScoreKernel<5>; +template class arm_compute::NEHarrisScoreKernel<7>; +template arm_compute::NEHarrisScoreKernel<3>::NEHarrisScoreKernel(); +template arm_compute::NEHarrisScoreKernel<5>::NEHarrisScoreKernel(); +template arm_compute::NEHarrisScoreKernel<7>::NEHarrisScoreKernel(); + +namespace +{ +inline float32x4_t harris_score(float32x4_t gx2, float32x4_t gy2, float32x4_t gxgy, float32x4_t sensitivity, float32x4_t strength_thresh) +{ + // Trace^2 + float32x4_t trace2 = vaddq_f32(gx2, gy2); + trace2 = vmulq_f32(trace2, trace2); + + // Det(A) + float32x4_t det = vmulq_f32(gx2, gy2); + det = vmlsq_f32(det, gxgy, gxgy); + + // Det(A) - sensitivity * trace^2 + const float32x4_t mc = vmlsq_f32(det, sensitivity, trace2); + + // mc > strength_thresh + const uint32x4_t mask = vcgtq_f32(mc, strength_thresh); + + return vbslq_f32(mask, mc, vdupq_n_f32(0.0f)); +} + +inline void harris_score1x3_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy, + float32x4_t norm_factor) +{ + // Normalize + low_gx = vmulq_f32(low_gx, norm_factor); + low_gy = vmulq_f32(low_gy, norm_factor); + high_gx = vmulq_f32(high_gx, norm_factor); + high_gy = vmulq_f32(high_gy, norm_factor); + + const float32x4_t l_gx = low_gx; + const float32x4_t l_gy = low_gy; + const float32x4_t m_gx = vextq_f32(low_gx, high_gx, 1); + const float32x4_t m_gy = vextq_f32(low_gy, high_gy, 1); + const float32x4_t r_gx = vextq_f32(low_gx, high_gx, 2); + const float32x4_t r_gy = vextq_f32(low_gy, high_gy, 2); + + // Gx*Gx + gx2 = vmlaq_f32(gx2, l_gx, l_gx); + gx2 = vmlaq_f32(gx2, m_gx, m_gx); + gx2 = vmlaq_f32(gx2, r_gx, r_gx); + + // Gy*Gy + gy2 = vmlaq_f32(gy2, l_gy, l_gy); + gy2 = vmlaq_f32(gy2, m_gy, m_gy); + gy2 = vmlaq_f32(gy2, r_gy, r_gy); + + // Gx*Gy + gxgy = vmlaq_f32(gxgy, l_gx, l_gy); + gxgy = vmlaq_f32(gxgy, m_gx, m_gy); + gxgy = vmlaq_f32(gxgy, r_gx, r_gy); +} + +inline void harris_score1x5_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t &gx2, float32x4_t &gy2, float32x4_t &gxgy, + float32x4_t norm_factor) +{ + // Normalize + low_gx = vmulq_f32(low_gx, norm_factor); + low_gy = vmulq_f32(low_gy, norm_factor); + high_gx = vmulq_f32(high_gx, norm_factor); + high_gy = vmulq_f32(high_gy, norm_factor); + + // L2 values + float32x4_t gx = low_gx; + float32x4_t gy = low_gy; + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // L1 
values + gx = vextq_f32(low_gx, high_gx, 1); + gy = vextq_f32(low_gy, high_gy, 1); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // M values + gx = vextq_f32(low_gx, high_gx, 2); + gy = vextq_f32(low_gy, high_gy, 2); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // R1 values + gx = vextq_f32(low_gx, high_gx, 3); + gy = vextq_f32(low_gy, high_gy, 3); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // R2 values + gx = high_gx; + gy = high_gy; + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); +} + +inline void harris_score1x7_FLOAT_FLOAT_FLOAT(float32x4_t low_gx, float32x4_t low_gy, float32x4_t high_gx, float32x4_t high_gy, float32x4_t high_gx1, float32x4_t high_gy1, float32x4_t &gx2, + float32x4_t &gy2, float32x4_t &gxgy, float32x4_t norm_factor) +{ + // Normalize + low_gx = vmulq_f32(low_gx, norm_factor); + low_gy = vmulq_f32(low_gy, norm_factor); + high_gx = vmulq_f32(high_gx, norm_factor); + high_gy = vmulq_f32(high_gy, norm_factor); + + // L3 values + float32x4_t gx = low_gx; + float32x4_t gy = low_gy; + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // L2 values + gx = vextq_f32(low_gx, high_gx, 1); + gy = vextq_f32(low_gy, high_gy, 1); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // L1 values + gx = vextq_f32(low_gx, high_gx, 2); + gy = vextq_f32(low_gy, high_gy, 2); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // M values + gx = vextq_f32(low_gx, high_gx, 3); + gy = vextq_f32(low_gy, high_gy, 3); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // R1 values + gx = high_gx; + gy = high_gy; + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // Change tmp_low and tmp_high for calculating R2 and R3 values + low_gx = high_gx; + low_gy = high_gy; + high_gx = high_gx1; + high_gy = high_gy1; + + // Normalize + high_gx = vmulq_f32(high_gx, norm_factor); + high_gy = vmulq_f32(high_gy, norm_factor); + + // R2 values + gx = vextq_f32(low_gx, high_gx, 1); + gy = vextq_f32(low_gy, high_gy, 1); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); + + // R3 values + gx = vextq_f32(low_gx, high_gx, 2); + gy = vextq_f32(low_gy, high_gy, 2); + + // Accumulate + gx2 = vmlaq_f32(gx2, gx, gx); + gy2 = vmlaq_f32(gy2, gy, gy); + gxgy = vmlaq_f32(gxgy, gx, gy); +} + +inline void harris_score3x3_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float in_norm_factor, float in_sensitivity, float in_strength_thresh) + +{ + const auto gx_ptr_0 = static_cast(input1_ptr) - 1; + const auto gy_ptr_0 = static_cast(input2_ptr) - 1; + const int16_t *gx_ptr_1 = gx_ptr_0 + 4; + const int16_t *gy_ptr_1 = gy_ptr_0 + 4; + const auto output = static_cast(output_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float32x4x2_t gx2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gy2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + 
} + }; + float32x4x2_t gxgy = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + + // Row0 + int16x8x2_t tmp_gx = + { + { + vld1q_s16(gx_ptr_0 - input_stride), + vld1q_s16(gx_ptr_1 - input_stride) + } + }; + int16x8x2_t tmp_gy = + { + { + vld1q_s16(gy_ptr_0 - input_stride), + vld1q_s16(gy_ptr_1 - input_stride) + } + }; + float32x4_t sensitivity = vdupq_n_f32(in_sensitivity); + float32x4_t norm_factor = vdupq_n_f32(in_norm_factor); + float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh); + + float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0]))); + float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0]))); + float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0]))); + float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0]))); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1]))); + low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1]))); + high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1]))); + high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1]))); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Row1 + tmp_gx.val[0] = vld1q_s16(gx_ptr_0); + tmp_gy.val[0] = vld1q_s16(gy_ptr_0); + tmp_gx.val[1] = vld1q_s16(gx_ptr_1); + tmp_gy.val[1] = vld1q_s16(gy_ptr_1); + + low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0]))); + low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0]))); + high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0]))); + high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0]))); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1]))); + low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1]))); + high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1]))); + high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1]))); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Row2 + tmp_gx.val[0] = vld1q_s16(gx_ptr_0 + input_stride); + tmp_gy.val[0] = vld1q_s16(gy_ptr_0 + input_stride); + tmp_gx.val[1] = vld1q_s16(gx_ptr_1 + input_stride); + tmp_gy.val[1] = vld1q_s16(gy_ptr_1 + input_stride); + + low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0]))); + low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0]))); + high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0]))); + high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0]))); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1]))); + low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1]))); + high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1]))); + high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1]))); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Calculate harris score + const float32x4x2_t mc = + { + { + harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh), + harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh) + } + }; + + // Store 
score + vst1q_f32(output + 0, mc.val[0]); + vst1q_f32(output + 4, mc.val[1]); +} + +inline void harris_score3x3_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float in_norm_factor, float in_sensitivity, float in_strength_thresh) +{ + auto gx_ptr_0 = static_cast(input1_ptr) - 1; + auto gy_ptr_0 = static_cast(input2_ptr) - 1; + const int32_t *gx_ptr_1 = gx_ptr_0 + 4; + const int32_t *gy_ptr_1 = gy_ptr_0 + 4; + const int32_t *gx_ptr_2 = gx_ptr_0 + 8; + const int32_t *gy_ptr_2 = gy_ptr_0 + 8; + const auto output = static_cast(output_ptr); + float32x4_t sensitivity = vdupq_n_f32(in_sensitivity); + float32x4_t norm_factor = vdupq_n_f32(in_norm_factor); + float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh); + + // Gx^2, Gy^2 and Gx*Gy + float32x4x2_t gx2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gy2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gxgy = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + + // Row0 + float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 - input_stride)); + float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 - input_stride)); + float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride)); + float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride)); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 - input_stride)); + low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 - input_stride)); + high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 - input_stride)); + high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 - input_stride)); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Row1 + low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0)); + low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0)); + high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1)); + high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1)); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1)); + low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1)); + high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2)); + high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2)); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Row2 + low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0 + input_stride)); + low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0 + input_stride)); + high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride)); + high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride)); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1 + input_stride)); + low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1 + input_stride)); + high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_2 + input_stride)); + high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_2 + input_stride)); + harris_score1x3_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Calculate harris score + const float32x4x2_t mc = + { + { + harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh), + harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh) + } + }; + + // Store score + vst1q_f32(output + 
0, mc.val[0]); + vst1q_f32(output + 4, mc.val[1]); +} + +inline void harris_score5x5_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float in_norm_factor, float in_sensitivity, float in_strength_thresh) +{ + auto gx_ptr_0 = static_cast(input1_ptr) - 2 - 2 * input_stride; + auto gy_ptr_0 = static_cast(input2_ptr) - 2 - 2 * input_stride; + const int16_t *gx_ptr_1 = gx_ptr_0 + 4; + const int16_t *gy_ptr_1 = gy_ptr_0 + 4; + const auto output = static_cast(output_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float32x4x2_t gx2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gy2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gxgy = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4_t sensitivity = vdupq_n_f32(in_sensitivity); + float32x4_t norm_factor = vdupq_n_f32(in_norm_factor); + float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh); + + for(int i = 0; i < 5; ++i) + { + const int16x8x2_t tmp_gx = + { + { + vld1q_s16(gx_ptr_0), + vld1q_s16(gx_ptr_1) + } + }; + const int16x8x2_t tmp_gy = + { + { + vld1q_s16(gy_ptr_0), + vld1q_s16(gy_ptr_1) + } + }; + + float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[0]))); + float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[0]))); + float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[0]))); + float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[0]))); + harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gx.val[1]))); + low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp_gy.val[1]))); + high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gx.val[1]))); + high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp_gy.val[1]))); + harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += input_stride; + gy_ptr_0 += input_stride; + gx_ptr_1 += input_stride; + gy_ptr_1 += input_stride; + } + + // Calculate harris score + const float32x4x2_t mc = + { + { + harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh), + harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh) + } + }; + + // Store score + vst1q_f32(output + 0, mc.val[0]); + vst1q_f32(output + 4, mc.val[1]); +} + +inline void harris_score5x5_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float in_norm_factor, float in_sensitivity, float in_strength_thresh) + +{ + auto gx_ptr_0 = static_cast(input1_ptr) - 2 - 2 * input_stride; + auto gy_ptr_0 = static_cast(input2_ptr) - 2 - 2 * input_stride; + const int32_t *gx_ptr_1 = gx_ptr_0 + 4; + const int32_t *gy_ptr_1 = gy_ptr_0 + 4; + const int32_t *gx_ptr_2 = gx_ptr_0 + 8; + const int32_t *gy_ptr_2 = gy_ptr_0 + 8; + const auto output = static_cast(output_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float32x4x2_t gx2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gy2 = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4x2_t gxgy = + { + { + vdupq_n_f32(0.0f), + vdupq_n_f32(0.0f) + } + }; + float32x4_t sensitivity = vdupq_n_f32(in_sensitivity); + float32x4_t norm_factor = vdupq_n_f32(in_norm_factor); + float32x4_t strength_thresh = 
vdupq_n_f32(in_strength_thresh); + + for(int i = 0; i < 5; ++i) + { + const float32x4_t low_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_0)); + const float32x4_t low_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_0)); + const float32x4_t high_gx_0 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1)); + const float32x4_t high_gy_0 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1)); + harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_0, low_gy_0, high_gx_0, high_gy_0, gx2.val[0], gy2.val[0], gxgy.val[0], norm_factor); + + const float32x4_t low_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_1)); + const float32x4_t low_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_1)); + const float32x4_t high_gx_1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2)); + const float32x4_t high_gy_1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2)); + harris_score1x5_FLOAT_FLOAT_FLOAT(low_gx_1, low_gy_1, high_gx_1, high_gy_1, gx2.val[1], gy2.val[1], gxgy.val[1], norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += input_stride; + gy_ptr_0 += input_stride; + gx_ptr_1 += input_stride; + gy_ptr_1 += input_stride; + gx_ptr_2 += input_stride; + gy_ptr_2 += input_stride; + } + + // Calculate harris score + const float32x4x2_t mc = + { + { + harris_score(gx2.val[0], gy2.val[0], gxgy.val[0], sensitivity, strength_thresh), + harris_score(gx2.val[1], gy2.val[1], gxgy.val[1], sensitivity, strength_thresh) + } + }; + + // Store score + vst1q_f32(output + 0, mc.val[0]); + vst1q_f32(output + 4, mc.val[1]); +} + +inline void harris_score7x7_S16_S16_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float in_norm_factor, float in_sensitivity, float in_strength_thresh) +{ + auto gx_ptr_0 = static_cast(input1_ptr) - 3 - 3 * input_stride; + auto gy_ptr_0 = static_cast(input2_ptr) - 3 - 3 * input_stride; + const int16_t *gx_ptr_1 = gx_ptr_0 + 8; + const int16_t *gy_ptr_1 = gy_ptr_0 + 8; + const auto output = static_cast(output_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float32x4_t gx2 = vdupq_n_f32(0.0f); + float32x4_t gy2 = vdupq_n_f32(0.0f); + float32x4_t gxgy = vdupq_n_f32(0.0f); + float32x4_t sensitivity = vdupq_n_f32(in_sensitivity); + float32x4_t norm_factor = vdupq_n_f32(in_norm_factor); + float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh); + + for(int i = 0; i < 7; ++i) + { + const int16x8_t tmp0_gx = vld1q_s16(gx_ptr_0); + const int16x8_t tmp0_gy = vld1q_s16(gy_ptr_0); + const int16x4_t tmp1_gx = vld1_s16(gx_ptr_1); + const int16x4_t tmp1_gy = vld1_s16(gy_ptr_1); + + float32x4_t low_gx = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gx))); + float32x4_t low_gy = vcvtq_f32_s32(vmovl_s16(vget_low_s16(tmp0_gy))); + float32x4_t high_gx = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gx))); + float32x4_t high_gy = vcvtq_f32_s32(vmovl_s16(vget_high_s16(tmp0_gy))); + float32x4_t high_gx1 = vcvtq_f32_s32(vmovl_s16(tmp1_gx)); + float32x4_t high_gy1 = vcvtq_f32_s32(vmovl_s16(tmp1_gy)); + harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += input_stride; + gy_ptr_0 += input_stride; + gx_ptr_1 += input_stride; + gy_ptr_1 += input_stride; + } + + // Calculate harris score + const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh); + + // Store score + vst1q_f32(output, mc); +} + +inline void harris_score7x7_S32_S32_FLOAT(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float in_norm_factor, float in_sensitivity, float in_strength_thresh) +{ + auto 
gx_ptr_0 = static_cast(input1_ptr) - 3 - 3 * input_stride; + auto gy_ptr_0 = static_cast(input2_ptr) - 3 - 3 * input_stride; + const int32_t *gx_ptr_1 = gx_ptr_0 + 4; + const int32_t *gy_ptr_1 = gy_ptr_0 + 4; + const int32_t *gx_ptr_2 = gx_ptr_1 + 4; + const int32_t *gy_ptr_2 = gy_ptr_1 + 4; + const auto output = static_cast(output_ptr); + + // Gx^2, Gy^2 and Gx*Gy + float32x4_t gx2 = vdupq_n_f32(0.0f); + float32x4_t gy2 = vdupq_n_f32(0.0f); + float32x4_t gxgy = vdupq_n_f32(0.0f); + float32x4_t sensitivity = vdupq_n_f32(in_sensitivity); + float32x4_t norm_factor = vdupq_n_f32(in_norm_factor); + float32x4_t strength_thresh = vdupq_n_f32(in_strength_thresh); + + for(int i = 0; i < 7; ++i) + { + const float32x4_t low_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_0)); + const float32x4_t low_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_0)); + const float32x4_t high_gx = vcvtq_f32_s32(vld1q_s32(gx_ptr_1)); + const float32x4_t high_gy = vcvtq_f32_s32(vld1q_s32(gy_ptr_1)); + const float32x4_t high_gx1 = vcvtq_f32_s32(vld1q_s32(gx_ptr_2)); + const float32x4_t high_gy1 = vcvtq_f32_s32(vld1q_s32(gy_ptr_2)); + harris_score1x7_FLOAT_FLOAT_FLOAT(low_gx, low_gy, high_gx, high_gy, high_gx1, high_gy1, gx2, gy2, gxgy, norm_factor); + + // Update gx and gy pointer + gx_ptr_0 += input_stride; + gy_ptr_0 += input_stride; + gx_ptr_1 += input_stride; + gy_ptr_1 += input_stride; + gx_ptr_2 += input_stride; + gy_ptr_2 += input_stride; + } + + // Calculate harris score + const float32x4_t mc = harris_score(gx2, gy2, gxgy, sensitivity, strength_thresh); + + // Store score + vst1q_f32(output, mc); +} + +} // namespace + +INEHarrisScoreKernel::INEHarrisScoreKernel() + : _input1(nullptr), _input2(nullptr), _output(nullptr), _sensitivity(0.0f), _strength_thresh(0.0f), _norm_factor(0.0f), _border_size() +{ +} + +template +NEHarrisScoreKernel::NEHarrisScoreKernel() + : INEHarrisScoreKernel(), _func(nullptr) +{ +} + +template +void NEHarrisScoreKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + Iterator input1(_input1, window); + Iterator input2(_input2, window); + Iterator output(_output, window); + + const size_t input_stride = _input1->info()->strides_in_bytes()[1] / element_size_from_data_type(_input1->info()->data_type()); + + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func)(input1.ptr(), input2.ptr(), output.ptr(), input_stride, _norm_factor, _sensitivity, _strength_thresh); + }, + input1, input2, output); +} + +template +BorderSize NEHarrisScoreKernel::border_size() const +{ + return _border_size; +} + +template +void NEHarrisScoreKernel::configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input1); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input2); + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::S16, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2); + ARM_COMPUTE_ERROR_ON(0.0f == norm_factor); + + _input1 = input1; + _input2 = input2; + _output = output; + _sensitivity = sensitivity; + _strength_thresh = strength_thresh; + _norm_factor = norm_factor; + _border_size = 
BorderSize(block_size / 2); + + if(input1->info()->data_type() == DataType::S16) + { + switch(block_size) + { + case 3: + _func = &harris_score3x3_S16_S16_FLOAT; + break; + case 5: + _func = &harris_score5x5_S16_S16_FLOAT; + break; + case 7: + _func = &harris_score7x7_S16_S16_FLOAT; + break; + default: + ARM_COMPUTE_ERROR("Invalid block size"); + break; + } + } + else + { + switch(block_size) + { + case 3: + _func = &harris_score3x3_S32_S32_FLOAT; + break; + case 5: + _func = &harris_score5x5_S32_S32_FLOAT; + break; + case 7: + _func = &harris_score7x7_S32_S32_FLOAT; + break; + default: + ARM_COMPUTE_ERROR("Invalid block size"); + break; + } + } + + ARM_COMPUTE_ERROR_ON(nullptr == _func); + + constexpr unsigned int num_elems_processed_per_iteration = block_size != 7 ? 8 : 4; + constexpr unsigned int num_elems_read_per_iteration = block_size != 7 ? 16 : 12; + constexpr unsigned int num_elems_written_per_iteration = block_size != 7 ? 8 : 4; + constexpr unsigned int num_rows_read_per_iteration = block_size; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input1->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration), + AccessWindowRectangle(input2->info(), -_border_size.left, -_border_size.top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region, border_undefined, border_size()); + + INEKernel::configure(win); +} diff --git a/src/core/NEON/kernels/NEHistogramKernel.cpp b/src/core/NEON/kernels/NEHistogramKernel.cpp new file mode 100644 index 0000000000..9e967ec4f5 --- /dev/null +++ b/src/core/NEON/kernels/NEHistogramKernel.cpp @@ -0,0 +1,252 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
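
For reference, every harris_score* specialization above reduces to the same per-pixel response once Gx^2, Gy^2 and Gx*Gy have been accumulated over the block and normalized. A minimal scalar sketch of that formula (illustrative helper name, not part of the patch):

// Scalar view of the response computed by the harris_score() helpers above:
// det(A) - sensitivity * trace(A)^2, zeroed when it does not exceed strength_thresh.
inline float harris_response_scalar(float gx2, float gy2, float gxgy,
                                    float sensitivity, float strength_thresh)
{
    const float trace = gx2 + gy2;               // trace(A)
    const float det   = gx2 * gy2 - gxgy * gxgy; // det(A)
    const float mc    = det - sensitivity * trace * trace;
    return (mc > strength_thresh) ? mc : 0.0f;   // same selection the vbslq_f32 mask performs
}
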
+ */ +#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IDistribution1D.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +inline void NEHistogramKernel::merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins) +{ + std::lock_guard lock(_hist_mtx); + + const unsigned int v_end = (bins / 4) * 4; + + for(unsigned int b = 0; b < v_end; b += 4) + { + const uint32x4_t tmp_global = vld1q_u32(global_hist + b); + const uint32x4_t tmp_local = vld1q_u32(local_hist + b); + vst1q_u32(global_hist + b, vaddq_u32(tmp_global, tmp_local)); + } + + for(unsigned int b = v_end; b < bins; ++b) + { + global_hist[b] += local_hist[b]; + } +} + +NEHistogramKernel::NEHistogramKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _local_hist(nullptr), _window_lut(nullptr), _hist_mtx() +{ +} + +void NEHistogramKernel::histogram_U8(Window win) +{ + ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); + + const size_t bins = _output->num_bins(); + const int32_t offset = _output->offset(); + const uint32_t offrange = offset + _output->range(); + const uint32_t *const w_lut = _window_lut; + uint32_t *const local_hist = _local_hist + win.thread_id() * bins; + + // Clear local_histogram + std::fill_n(local_hist, bins, 0); + + auto update_local_hist = [&](uint8_t p) + { + if(offset <= p && p < offrange) + { + ++local_hist[w_lut[p]]; + } + }; + + const unsigned int x_start = win.x().start(); + const unsigned int x_end = win.x().end(); + + // Handle X dimension manually to split into two loops + // First one will use vector operations, second one processes the left over + // pixels + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win); + + // Calculate local histogram + execute_window_loop(win, [&](const Coordinates &) + { + unsigned int x = x_start; + + // Vector loop + for(; x <= x_end - 8; x += 8) + { + const uint8x8_t pixels = vld1_u8(input.ptr() + x); + + update_local_hist(vget_lane_u8(pixels, 0)); + update_local_hist(vget_lane_u8(pixels, 1)); + update_local_hist(vget_lane_u8(pixels, 2)); + update_local_hist(vget_lane_u8(pixels, 3)); + update_local_hist(vget_lane_u8(pixels, 4)); + update_local_hist(vget_lane_u8(pixels, 5)); + update_local_hist(vget_lane_u8(pixels, 6)); + update_local_hist(vget_lane_u8(pixels, 7)); + } + + // Process leftover pixels + for(; x < x_end; ++x) + { + update_local_hist(input.ptr()[x]); + } + }, + input); + + // Merge histograms + merge_histogram(_output->buffer(), local_hist, bins); +} + +void NEHistogramKernel::histogram_fixed_U8(Window win) +{ + ARM_COMPUTE_ERROR_ON(_output->buffer() == nullptr); + + std::array local_hist{ { 0 } }; + + const unsigned int x_start = win.x().start(); + const unsigned int x_end = win.x().end(); + + // Handle X dimension manually to split into two loops + // First one will use vector operations, second one processes the left over + // pixels + win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input(_input, win); + + // Calculate local histogram + execute_window_loop(win, [&](const Coordinates &) + { + unsigned int x = x_start; + + // Vector loop + for(; x <= x_end - 8; x += 8) + { + const uint8x8_t pixels = vld1_u8(input.ptr() 
+ x); + + ++local_hist[vget_lane_u8(pixels, 0)]; + ++local_hist[vget_lane_u8(pixels, 1)]; + ++local_hist[vget_lane_u8(pixels, 2)]; + ++local_hist[vget_lane_u8(pixels, 3)]; + ++local_hist[vget_lane_u8(pixels, 4)]; + ++local_hist[vget_lane_u8(pixels, 5)]; + ++local_hist[vget_lane_u8(pixels, 6)]; + ++local_hist[vget_lane_u8(pixels, 7)]; + } + + // Process leftover pixels + for(; x < x_end; ++x) + { + ++local_hist[input.ptr()[x]]; + } + }, + input); + + // Merge histograms + merge_histogram(_output->buffer(), local_hist.data(), _max_range_size); +} + +void NEHistogramKernel::calculate_window_lut() const +{ + const int32_t offset = _output->offset(); + const size_t bins = _output->num_bins(); + const uint32_t range = _output->range(); + + std::fill_n(_window_lut, offset, 0); + + for(unsigned int p = offset; p < _max_range_size; ++p) + { + _window_lut[p] = ((p - offset) * bins) / range; + } +} + +void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == output); + ARM_COMPUTE_ERROR_ON(nullptr == local_hist); + ARM_COMPUTE_ERROR_ON(nullptr == window_lut); + + _input = input; + _output = output; + _local_hist = local_hist; + _window_lut = window_lut; + + //Check offset + ARM_COMPUTE_ERROR_ON_MSG(0 > _output->offset() || _output->offset() > static_cast(_max_range_size), "Offset is larger than the image value range."); + + //Check range + ARM_COMPUTE_ERROR_ON_MSG(static_cast(_output->range()) > static_cast(_max_range_size) /* max range */, "Range larger than the image value range."); + + // Calculate LUT + calculate_window_lut(); + + // Set appropriate function + _func = &NEHistogramKernel::histogram_U8; + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + INEKernel::configure(win); +} + +void NEHistogramKernel::configure(const IImage *input, IDistribution1D *output) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + _input = input; + _output = output; + + // Set appropriate function + _func = &NEHistogramKernel::histogram_fixed_U8; + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + INEKernel::configure(win); +} + +void NEHistogramKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/NEON/kernels/NEIm2ColKernel.cpp new file mode 100644 index 0000000000..c7c23d5d06 --- /dev/null +++ b/src/core/NEON/kernels/NEIm2ColKernel.cpp @@ -0,0 +1,338 @@ +/* + * Copyright (c) 2017 ARM Limited. 
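
The histogram path above splits the work into a per-thread local histogram plus a locked merge, and (for the non-fixed case) a LUT that maps a pixel value inside [offset, offset + range) to its bin. A scalar sketch of both pieces, with illustrative helper names:

#include <cstddef>
#include <cstdint>

// Value-to-bin mapping built by calculate_window_lut() above.
// Caller is expected to have checked offset <= p < offset + range.
inline uint32_t bin_of(uint8_t p, int32_t offset, size_t bins, uint32_t range)
{
    return static_cast<uint32_t>(((p - offset) * bins) / range);
}

// Scalar equivalent of merge_histogram(): the NEON loop does the same
// accumulation four bins at a time under the mutex.
inline void merge_histogram_scalar(uint32_t *global_hist, const uint32_t *local_hist, size_t bins)
{
    for(size_t b = 0; b < bins; ++b)
    {
        global_hist[b] += local_hist[b];
    }
}
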
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/FixedPoint.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +template +inline void linearize_volume(const uint8_t *const in_ptr, + T *out_ptr, + bool has_bias, + int top_left_x, + int top_left_y, + int kernel_size, + int kernel_depth, + int input_w, + int input_h, + int input_stride_x, + int input_stride_y, + int input_stride_z, + int fixed_point_position) +{ + const int kernel_size2 = kernel_size * kernel_size; + const int x_e = top_left_x + kernel_size; + const int y_e = top_left_y + kernel_size; + + // Linearize volume + int d = 0; + // This for loop linearize a volume with 3 slices. 
This allows: + // 1) to reduce the iterations of the outer for loop "d" + // 2) to have an optimized im2col for the first convolution layer where usually we have 3 IFMs + for(; d <= (kernel_depth - 3); d += 3) + { + for(int y = top_left_y; y < y_e; ++y) + { + if((y < 0 || y >= input_h) && has_pads) + { + // All the values will be zeros + for(int x = top_left_x; x < x_e; ++x, ++out_ptr) + { + *(out_ptr + 0 * kernel_size2) = 0; + *(out_ptr + 1 * kernel_size2) = 0; + *(out_ptr + 2 * kernel_size2) = 0; + } + } + else + { + for(int x = top_left_x; x < x_e; ++x, ++out_ptr) + { + if((x < 0 || x >= input_w) && has_pads) + { + *(out_ptr + 0 * kernel_size2) = 0; + *(out_ptr + 1 * kernel_size2) = 0; + *(out_ptr + 2 * kernel_size2) = 0; + } + else + { + *(out_ptr + 0 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 0) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 1 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 1) * input_stride_z + y * input_stride_y + x * input_stride_x))); + *(out_ptr + 2 * kernel_size2) = *(reinterpret_cast(in_ptr + ((d + 2) * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + out_ptr += 2 * kernel_size2; + } + + // Left over + for(; d < kernel_depth; d++) + { + for(int y = top_left_y; y < y_e; ++y) + { + if((y < 0 || y >= input_h) && has_pads) + { + // All the values will be zeros + memset(out_ptr, 0, kernel_size * sizeof(T)); + out_ptr += kernel_size; + } + else + { + for(int x = top_left_x; x < x_e; ++x, ++out_ptr) + { + if((x < 0 || x >= input_w) && has_pads) + { + *out_ptr = 0; + } + else + { + *out_ptr = *(reinterpret_cast(in_ptr + (d * input_stride_z + y * input_stride_y + x * input_stride_x))); + } + } + } + } + } + + // Append 1 if the convolution layer has biases + if(has_bias) + { + if(std::is_same::value) + { + *out_ptr = scvt_qs8_f32(1.0f, fixed_point_position); + } + else + { + *out_ptr = static_cast(1); + } + } +} +} // namespace + +template +void NEIm2ColKernel::run_generic(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const int kernel_depth = _input->info()->dimension(2); + const int input_w = _input->info()->dimension(0); + const int input_h = _input->info()->dimension(1); + const int input_stride_x = _input->info()->strides_in_bytes().x(); + const int input_stride_y = _input->info()->strides_in_bytes().y(); + const int input_stride_z = _input->info()->strides_in_bytes().z(); + + int pad_x = 0; + int pad_y = 0; + int stride_x = 0; + int stride_y = 0; + std::tie(pad_x, pad_y) = _conv_info.pad(); + std::tie(stride_x, stride_y) = _conv_info.stride(); + + // Setup input window + const int start_x = -pad_x; + const int start_y = -pad_y; + + Window window_in(window); + // The first three dimensions of the input are increased by the inner loops + window_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + window_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Setup output window + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, _output->info()->dimension(0), _output->info()->strides_in_bytes().y() / _output->info()->element_size())); + window_out.set(Window::DimY, Window::Dimension(window.y().start() * _convolved_dims.first, window.y().end() * _convolved_dims.first, _convolved_dims.first)); + window_out.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(_input, window_in); 
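
linearize_volume() above writes one im2col output row per convolution position: the kernel_size x kernel_size x kernel_depth receptive field laid out channel by channel, with a trailing 1 when the layer has a bias. A compact scalar sketch of that mapping for F32 and the no-padding case (strides are in elements here, whereas the kernel above works with byte strides; names are illustrative):

// Illustrative scalar im2col for one output row: copies the receptive field at
// (top_left_x, top_left_y) for every input channel, then appends 1.0f for the bias.
inline void im2col_patch_f32(const float *in, float *out,
                             int top_left_x, int top_left_y,
                             int kernel_size, int kernel_depth,
                             int stride_y, int stride_z, bool has_bias)
{
    for(int d = 0; d < kernel_depth; ++d)
    {
        for(int y = top_left_y; y < top_left_y + kernel_size; ++y)
        {
            for(int x = top_left_x; x < top_left_x + kernel_size; ++x)
            {
                *out++ = in[d * stride_z + y * stride_y + x];
            }
        }
    }
    if(has_bias)
    {
        *out++ = 1.0f; // matches the trailing 1 appended by linearize_volume()
    }
}
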
+ Iterator out(_output, window_out); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int top_left_x = id.x() * stride_x + start_x; + const int top_left_y = id.y() * stride_y + start_y; + + // Get pointers + const uint8_t *const input_ptr = in.ptr(); + auto output_ptr = reinterpret_cast(out.ptr()); + + // Linearize volume + linearize_volume(input_ptr, + output_ptr, + _has_bias, + top_left_x, + top_left_y, + static_cast(_kernel_size), + kernel_depth, + input_w, + input_h, + input_stride_x, + input_stride_y, + input_stride_z, + _input->info()->fixed_point_position()); + }, + in, out); +} + +template +void NEIm2ColKernel::run_reduced(const Window &window) +{ + const size_t in_width = _input->info()->dimension(0); + const size_t in_height = _input->info()->dimension(1); + const size_t out_step_x = in_width * _input->info()->element_size(); + const size_t out_step_y = out_step_x * in_height; + const size_t out_width = _output->info()->dimension(0); + + Window in_window(window); + in_window.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Window out_window; + out_window.use_tensor_dimensions(_output->info()); + out_window.set(Window::DimX, Window::Dimension(out_window.x().start(), out_window.x().end(), in_width)); + + Window in_slice = in_window.first_slice_window_3D(); + Window out_slice = out_window.first_slice_window_1D(); + + do + { + Iterator in(_input, in_slice); + Iterator out(_output, out_slice); + + uint8_t *out_ptr = out.ptr(); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + memcpy(out_ptr + id.y() * out_step_x + id.z() * out_step_y, in.ptr(), out_step_x); + }, + in); + + // Add bias + if(_has_bias) + { + if(std::is_same::value) + { + *(reinterpret_cast(out_ptr) + out_width - 1) = scvt_qs8_f32(1.0f, _input->info()->fixed_point_position()); + } + else + { + *(reinterpret_cast(out_ptr) + out_width - 1) = static_cast(1); + } + } + } + while(in_window.slide_window_slice_3D(in_slice) && out_window.slide_window_slice_1D(out_slice)); +} + +NEIm2ColKernel::NEIm2ColKernel() + : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_size(0), _has_bias(false) +{ +} + +void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, std::pair convolved_dims, const PadStrideInfo &conv_info, bool has_bias) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + _convolved_dims = convolved_dims; + _conv_info = conv_info; + _kernel_size = std::sqrt((output->info()->dimension(0) - (has_bias ? 
1 : 0)) / input->info()->dimension(2)); + _has_bias = has_bias; + + unsigned int pad_x, pad_y, stride_x, stride_y = 0; + std::tie(pad_x, pad_y) = conv_info.pad(); + std::tie(stride_x, stride_y) = conv_info.stride(); + + bool run_img2col_reduced = (output->info()->dimension(0) == (input->info()->dimension(0) * input->info()->dimension(1) * input->info()->dimension(2))) && (TensorShape::num_max_dimensions >= 4) + && (std::equal(input->info()->tensor_shape().cbegin() + 3, + input->info()->tensor_shape().cend(), + output->info()->tensor_shape().cbegin() + 1)) + && ((stride_x == 1) && (stride_y == 1) && (pad_x == 0) && (pad_y == 0)); + + Window window = calculate_max_window(*input->info(), Steps()); + + if(run_img2col_reduced) + { + switch(_input->info()->data_type()) + { + case DataType::F32: + _func = &NEIm2ColKernel::run_reduced; + break; + case DataType::QS8: + _func = &NEIm2ColKernel::run_reduced; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + } + else + { + switch(_input->info()->data_type()) + { + case DataType::F32: + _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic : &NEIm2ColKernel::run_generic; + break; + case DataType::QS8: + _func = ((pad_x == 0) && (pad_y == 0)) ? &NEIm2ColKernel::run_generic : &NEIm2ColKernel::run_generic; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported"); + break; + } + window.set(Window::DimX, Window::Dimension(0, _convolved_dims.first, 1)); + window.set(Window::DimY, Window::Dimension(0, _convolved_dims.second, 1)); + window.set(Window::DimZ, Window::Dimension(0, 1, 1)); + } + + // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped + output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape())); + + IKernel::configure(window); +} + +void NEIm2ColKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEIntegralImageKernel.cpp b/src/core/NEON/kernels/NEIntegralImageKernel.cpp new file mode 100644 index 0000000000..3b09a1bdbb --- /dev/null +++ b/src/core/NEON/kernels/NEIntegralImageKernel.cpp @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +using namespace arm_compute; + +void NEIntegralImageKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U32); + + _input = input; + _output = output; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + // The kernel is effectively reading 17 values from -1 as it loads 16 + // starting at -1 and also 16 starting at 0 + AccessWindowRectangle output_read_access(output->info(), -1, -1, num_elems_processed_per_iteration + 1, 1); + AccessWindowHorizontal output_write_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), + output_read_access, output_write_access); + + output_write_access.set_valid_region(win, input->info()->valid_region()); + + IKernel::configure(win); +} + +BorderSize NEIntegralImageKernel::border_size() const +{ + return BorderSize(1, 0, 0, 1); +} + +bool NEIntegralImageKernel::is_parallelisable() const +{ + return false; +} + +void NEIntegralImageKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + Iterator input(_input, window); + Iterator output(_output, window); + + const auto output_top_left = reinterpret_cast(_output->ptr_to_element(Coordinates(-1, -1))); + const auto output_top_mid = reinterpret_cast(_output->ptr_to_element(Coordinates(0, -1))); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t input_pixels = vld1q_u8(input.ptr()); + + const uint16x8x2_t tmp = + { + { + vmovl_u8(vget_low_u8(input_pixels)), + vmovl_u8(vget_high_u8(input_pixels)) + } + }; + + uint32x4x4_t pixels = + { + { + vmovl_u16(vget_low_u16(tmp.val[0])), + vmovl_u16(vget_high_u16(tmp.val[0])), + vmovl_u16(vget_low_u16(tmp.val[1])), + vmovl_u16(vget_high_u16(tmp.val[1])) + } + }; + + // Divide by four as pointer is now uint32 instead of uint8! 
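
The additions and subtractions below, followed by the in-row prefix sum, implement the standard integral-image recurrence. A scalar sketch for one row, assuming the previous output row and the left border element are already valid (the kernel reads them through its -1/-1 access window and BorderSize(1, 0, 0, 1)):

#include <cstdint>

// Scalar view of the recurrence the vector code implements:
//   I(x, y) = p(x, y) + I(x - 1, y) + I(x, y - 1) - I(x - 1, y - 1)
// 'top' points at output row y - 1, 'out' at row y; index -1 is the border column.
inline void integral_image_row(const uint8_t *in, const uint32_t *top, uint32_t *out, int width)
{
    for(int x = 0; x < width; ++x)
    {
        out[x] = in[x] + out[x - 1] + top[x] - top[x - 1];
    }
}
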
+ const size_t off = output.offset() / 4; + + // Add top mid pixel values + const uint32_t *const top_mid_ptr = output_top_mid + off; + + pixels.val[0] = vaddq_u32(vld1q_u32(top_mid_ptr), pixels.val[0]); + pixels.val[1] = vaddq_u32(vld1q_u32(top_mid_ptr + 4), pixels.val[1]); + pixels.val[2] = vaddq_u32(vld1q_u32(top_mid_ptr + 8), pixels.val[2]); + pixels.val[3] = vaddq_u32(vld1q_u32(top_mid_ptr + 12), pixels.val[3]); + + // Subtract top left diagonal values + const auto outptr = reinterpret_cast(output.ptr()); + const uint32_t *const top_left_ptr = output_top_left + off; + + pixels.val[0] = vsubq_u32(pixels.val[0], vld1q_u32(top_left_ptr)); + vst1q_u32(outptr, pixels.val[0]); + + pixels.val[1] = vsubq_u32(pixels.val[1], vld1q_u32(top_left_ptr + 4)); + vst1q_u32(outptr + 4, pixels.val[1]); + + pixels.val[2] = vsubq_u32(pixels.val[2], vld1q_u32(top_left_ptr + 8)); + vst1q_u32(outptr + 8, pixels.val[2]); + + pixels.val[3] = vsubq_u32(pixels.val[3], vld1q_u32(top_left_ptr + 12)); + vst1q_u32(outptr + 12, pixels.val[3]); + + // Perform prefix summation + for(auto i = 0; i < 16; ++i) + { + outptr[i] += outptr[i - 1]; + } + }, + input, output); +} diff --git a/src/core/NEON/kernels/NELKTrackerKernel.cpp b/src/core/NEON/kernels/NELKTrackerKernel.cpp new file mode 100644 index 0000000000..3d2bfb204e --- /dev/null +++ b/src/core/NEON/kernels/NELKTrackerKernel.cpp @@ -0,0 +1,533 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include + +using namespace arm_compute; + +/** Constants used for Lucas-Kanade Algorithm */ +constexpr int W_BITS = 14; +constexpr float D0 = 1 << W_BITS; +constexpr float DETERMINANT_THRESHOLD = 1.0e-07f; // Threshold for the determinant. Used for lost tracking criteria +constexpr float EIGENVALUE_THRESHOLD = 1.0e-04f; // Thresholds for minimum eigenvalue. 
Used for lost tracking criteria +constexpr float FLT_SCALE = 1.0f / (1 << 20); + +namespace +{ +enum class BilinearInterpolation +{ + BILINEAR_OLD_NEW, + BILINEAR_SCHARR +}; + +template +constexpr int INT_ROUND(T x, int n) +{ + return (x + (1 << (n - 1))) >> n; +} + +template +inline int get_pixel(const ITensor *tensor, int xi, int yi, int iw00, int iw01, int iw10, int iw11, int scale) +{ + const auto px00 = *reinterpret_cast(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi, yi))); + const auto px01 = *reinterpret_cast(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi + 1, yi))); + const auto px10 = *reinterpret_cast(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi, yi + 1))); + const auto px11 = *reinterpret_cast(tensor->buffer() + tensor->info()->offset_element_in_bytes(Coordinates(xi + 1, yi + 1))); + + return INT_ROUND(px00 * iw00 + px01 * iw01 + px10 * iw10 + px11 * iw11, scale); +} + +inline int32x4_t compute_bilinear_interpolation(int16x8_t top_row, int16x8_t bottom_row, int16x4_t w00, int16x4_t w01, int16x4_t w10, int16x4_t w11, int32x4_t shift) +{ + // Get the left column of upper row + const int16x4_t px00 = vget_low_s16(top_row); + + // Get the right column of upper row + const int16x4_t px01 = vext_s16(px00, vget_high_s16(top_row), 1); + + // Get the left column of lower row + const int16x4_t px10 = vget_low_s16(bottom_row); + + // Get the right column of right row + const int16x4_t px11 = vext_s16(px10, vget_high_s16(bottom_row), 1); + + // Apply the bilinear filter + return vqrshlq_s32(vmull_s16(px00, w00) + vmull_s16(px01, w01) + vmull_s16(px10, w10) + vmull_s16(px11, w11), shift); +} +} // namespace + +void NELKTrackerKernel::init_keypoints(int start, int end) +{ + if(_level == _num_levels - 1) + { + const float level_scale = pow(_pyramid_scale, _level); + + for(int i = start; i < end; ++i) + { + _old_points_internal->at(i).x = _old_points->at(i).x * level_scale; + _old_points_internal->at(i).y = _old_points->at(i).y * level_scale; + _old_points_internal->at(i).tracking_status = true; + + NELKInternalKeypoint keypoint_to_track; + + if(_use_initial_estimate) + { + keypoint_to_track.x = _new_points_estimates->at(i).x * level_scale; + keypoint_to_track.y = _new_points_estimates->at(i).y * level_scale; + keypoint_to_track.tracking_status = (_new_points_estimates->at(i).tracking_status == 1); + } + else + { + keypoint_to_track.x = _old_points_internal->at(i).x; + keypoint_to_track.y = _old_points_internal->at(i).y; + keypoint_to_track.tracking_status = true; + } + + _new_points_internal->at(i) = keypoint_to_track; + } + } + else + { + for(int i = start; i < end; ++i) + { + _old_points_internal->at(i).x /= _pyramid_scale; + _old_points_internal->at(i).y /= _pyramid_scale; + _new_points_internal->at(i).x /= _pyramid_scale; + _new_points_internal->at(i).y /= _pyramid_scale; + } + } +} + +std::tuple NELKTrackerKernel::compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy) +{ + int iA11 = 0; + int iA12 = 0; + int iA22 = 0; + + int32x4_t nA11 = vdupq_n_s32(0); + int32x4_t nA12 = vdupq_n_s32(0); + int32x4_t nA22 = vdupq_n_s32(0); + + float keypoint_int_x = 0; + float keypoint_int_y = 0; + + const float wx = std::modf(keypoint.x, &keypoint_int_x); + const float wy = std::modf(keypoint.y, &keypoint_int_y); + + const int iw00 = roundf((1.0f - wx) * (1.0f - wy) * D0); + const int iw01 = roundf(wx * (1.0f - wy) * D0); + const int iw10 = roundf((1.0f - wx) * wy * D0); 
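
Together with iw11, computed next so that the four weights sum exactly to D0 = 1 << W_BITS, these are the usual bilinear interpolation coefficients expressed in fixed point. A floating-point sketch of the same weighting (illustrative names, not part of the patch):

// Floating-point view of the fixed-point bilinear weights used above:
// the sample at (x + wx, y + wy) is a convex combination of its 4 integer neighbours.
struct BilinearWeights
{
    float w00, w01, w10, w11;
};

inline BilinearWeights bilinear_weights(float wx, float wy) // wx, wy are the fractional parts
{
    return { (1.0f - wx) * (1.0f - wy), // top-left
             wx * (1.0f - wy),          // top-right
             (1.0f - wx) * wy,          // bottom-left
             wx * wy };                 // bottom-right
}

inline float bilinear_sample(float p00, float p01, float p10, float p11, const BilinearWeights &w)
{
    return p00 * w.w00 + p01 * w.w01 + p10 * w.w10 + p11 * w.w11;
}
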
+ const int iw11 = D0 - iw00 - iw01 - iw10; + + const int16x4_t nw00 = vdup_n_s16(iw00); + const int16x4_t nw01 = vdup_n_s16(iw01); + const int16x4_t nw10 = vdup_n_s16(iw10); + const int16x4_t nw11 = vdup_n_s16(iw11); + + // Convert stride from uint8_t* to int16_t* + const size_t row_stride = _old_scharr_gx->info()->strides_in_bytes()[1] / 2; + const Coordinates top_left_window_corner(static_cast<int>(keypoint_int_x) - _window_dimension / 2, static_cast<int>(keypoint_int_y) - _window_dimension / 2); + auto idx = reinterpret_cast<const int16_t *>(_old_scharr_gx->buffer() + _old_scharr_gx->info()->offset_element_in_bytes(top_left_window_corner)); + auto idy = reinterpret_cast<const int16_t *>(_old_scharr_gy->buffer() + _old_scharr_gy->info()->offset_element_in_bytes(top_left_window_corner)); + static const int32x4_t nshifter_scharr = vdupq_n_s32(-W_BITS); + + for(int ky = 0; ky < _window_dimension; ++ky, idx += row_stride, idy += row_stride) + { + int kx = 0; + + // Calculate elements in blocks of four as long as possible + for(; kx <= _window_dimension - 4; kx += 4) + { + // Interpolation X + const int16x8_t ndx_row1 = vld1q_s16(idx + kx); + const int16x8_t ndx_row2 = vld1q_s16(idx + kx + row_stride); + + const int32x4_t nxval = compute_bilinear_interpolation(ndx_row1, ndx_row2, nw00, nw01, nw10, nw11, nshifter_scharr); + + // Interpolation Y + const int16x8_t ndy_row1 = vld1q_s16(idy + kx); + const int16x8_t ndy_row2 = vld1q_s16(idy + kx + row_stride); + + const int32x4_t nyval = compute_bilinear_interpolation(ndy_row1, ndy_row2, nw00, nw01, nw10, nw11, nshifter_scharr); + + // Store the intermediate data so that we don't need to recalculate them in later stage + vst1q_s32(bilinear_ix + kx + ky * _window_dimension, nxval); + vst1q_s32(bilinear_iy + kx + ky * _window_dimension, nyval); + + // Accumulate Ix^2 + nA11 = vmlaq_s32(nA11, nxval, nxval); + // Accumulate Ix * Iy + nA12 = vmlaq_s32(nA12, nxval, nyval); + // Accumulate Iy^2 + nA22 = vmlaq_s32(nA22, nyval, nyval); + } + + // Calculate the leftover elements + for(; kx < _window_dimension; ++kx) + { + const int32_t ixval = get_pixel<int16_t>(_old_scharr_gx, top_left_window_corner.x() + kx, top_left_window_corner.y() + ky, + iw00, iw01, iw10, iw11, W_BITS); + const int32_t iyval = get_pixel<int16_t>(_old_scharr_gy, top_left_window_corner.x() + kx, top_left_window_corner.y() + ky, + iw00, iw01, iw10, iw11, W_BITS); + + iA11 += ixval * ixval; + iA12 += ixval * iyval; + iA22 += iyval * iyval; + + bilinear_ix[kx + ky * _window_dimension] = ixval; + bilinear_iy[kx + ky * _window_dimension] = iyval; + } + } + + iA11 += vgetq_lane_s32(nA11, 0) + vgetq_lane_s32(nA11, 1) + vgetq_lane_s32(nA11, 2) + vgetq_lane_s32(nA11, 3); + iA12 += vgetq_lane_s32(nA12, 0) + vgetq_lane_s32(nA12, 1) + vgetq_lane_s32(nA12, 2) + vgetq_lane_s32(nA12, 3); + iA22 += vgetq_lane_s32(nA22, 0) + vgetq_lane_s32(nA22, 1) + vgetq_lane_s32(nA22, 2) + vgetq_lane_s32(nA22, 3); + + return std::make_tuple(iA11, iA12, iA22); +} + +std::pair<int, int> NELKTrackerKernel::compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy) +{ + int ib1 = 0; + int ib2 = 0; + + int32x4_t nb1 = vdupq_n_s32(0); + int32x4_t nb2 = vdupq_n_s32(0); + + // Compute weights for the old keypoint + float old_keypoint_int_x = 0; + float old_keypoint_int_y = 0; + + const float old_wx = std::modf(old_keypoint.x, &old_keypoint_int_x); + const float old_wy = std::modf(old_keypoint.y, &old_keypoint_int_y); + + const int iw00_old = roundf((1.0f - old_wx) * (1.0f - old_wy) * D0); +
const int iw01_old = roundf(old_wx * (1.0f - old_wy) * D0); + const int iw10_old = roundf((1.0f - old_wx) * old_wy * D0); + const int iw11_old = D0 - iw00_old - iw01_old - iw10_old; + + const int16x4_t nw00_old = vdup_n_s16(iw00_old); + const int16x4_t nw01_old = vdup_n_s16(iw01_old); + const int16x4_t nw10_old = vdup_n_s16(iw10_old); + const int16x4_t nw11_old = vdup_n_s16(iw11_old); + + // Compute weights for the new keypoint + float new_keypoint_int_x = 0; + float new_keypoint_int_y = 0; + + const float new_wx = std::modf(new_keypoint.x, &new_keypoint_int_x); + const float new_wy = std::modf(new_keypoint.y, &new_keypoint_int_y); + + const int iw00_new = roundf((1.0f - new_wx) * (1.0f - new_wy) * D0); + const int iw01_new = roundf(new_wx * (1.0f - new_wy) * D0); + const int iw10_new = roundf((1.0f - new_wx) * new_wy * D0); + const int iw11_new = D0 - iw00_new - iw01_new - iw10_new; + + const int16x4_t nw00_new = vdup_n_s16(iw00_new); + const int16x4_t nw01_new = vdup_n_s16(iw01_new); + const int16x4_t nw10_new = vdup_n_s16(iw10_new); + const int16x4_t nw11_new = vdup_n_s16(iw11_new); + + const int row_stride = _input_new->info()->strides_in_bytes()[1]; + const Coordinates top_left_window_corner_old(static_cast(old_keypoint_int_x) - _window_dimension / 2, static_cast(old_keypoint_int_y) - _window_dimension / 2); + const Coordinates top_left_window_corner_new(static_cast(new_keypoint_int_x) - _window_dimension / 2, static_cast(new_keypoint_int_y) - _window_dimension / 2); + const uint8_t *old_ptr = _input_old->buffer() + _input_old->info()->offset_element_in_bytes(top_left_window_corner_old); + const uint8_t *new_ptr = _input_new->buffer() + _input_new->info()->offset_element_in_bytes(top_left_window_corner_new); + static const int32x4_t nshifter_tensor = vdupq_n_s32(-(W_BITS - 5)); + + for(int ky = 0; ky < _window_dimension; ++ky, new_ptr += row_stride, old_ptr += row_stride) + { + int kx = 0; + + // Calculate elements in blocks of four as long as possible + for(; kx <= _window_dimension - 4; kx += 4) + { + // Interpolation old tensor + const int16x8_t nold_row1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(old_ptr + kx))); + const int16x8_t nold_row2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(old_ptr + kx + row_stride))); + + const int32x4_t noldval = compute_bilinear_interpolation(nold_row1, nold_row2, nw00_old, nw01_old, nw10_old, nw11_old, nshifter_tensor); + + // Interpolation new tensor + const int16x8_t nnew_row1 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(new_ptr + kx))); + const int16x8_t nnew_row2 = vreinterpretq_s16_u16(vmovl_u8(vld1_u8(new_ptr + kx + row_stride))); + + const int32x4_t nnewval = compute_bilinear_interpolation(nnew_row1, nnew_row2, nw00_new, nw01_new, nw10_new, nw11_new, nshifter_tensor); + + // Calculate It gradient, i.e. 
pixelwise difference between old and new tensor + const int32x4_t diff = vsubq_s32(nnewval, noldval); + + // Load the Ix and Iy gradient computed in the previous stage + const int32x4_t nxval = vld1q_s32(bilinear_ix + kx + ky * _window_dimension); + const int32x4_t nyval = vld1q_s32(bilinear_iy + kx + ky * _window_dimension); + + // Calculate Ix * It and Iy * It, and accumulate the results + nb1 = vmlaq_s32(nb1, diff, nxval); + nb2 = vmlaq_s32(nb2, diff, nyval); + } + + // Calculate the leftover elements + for(; kx < _window_dimension; ++kx) + { + const int32_t ival = get_pixel<uint8_t>(_input_old, top_left_window_corner_old.x() + kx, top_left_window_corner_old.y() + ky, + iw00_old, iw01_old, iw10_old, iw11_old, W_BITS - 5); + const int32_t jval = get_pixel<uint8_t>(_input_new, top_left_window_corner_new.x() + kx, top_left_window_corner_new.y() + ky, + iw00_new, iw01_new, iw10_new, iw11_new, W_BITS - 5); + + const int32_t diff = jval - ival; + + ib1 += diff * bilinear_ix[kx + ky * _window_dimension]; + ib2 += diff * bilinear_iy[kx + ky * _window_dimension]; + } + } + + ib1 += vgetq_lane_s32(nb1, 0) + vgetq_lane_s32(nb1, 1) + vgetq_lane_s32(nb1, 2) + vgetq_lane_s32(nb1, 3); + ib2 += vgetq_lane_s32(nb2, 0) + vgetq_lane_s32(nb2, 1) + vgetq_lane_s32(nb2, 2) + vgetq_lane_s32(nb2, 3); + + return std::make_pair(ib1, ib2); +} + +NELKTrackerKernel::NELKTrackerKernel() + : _input_old(nullptr), _input_new(nullptr), _old_scharr_gx(nullptr), _old_scharr_gy(nullptr), _new_points(nullptr), _new_points_estimates(nullptr), _old_points(nullptr), _old_points_internal(), + _new_points_internal(), _termination(Termination::TERM_CRITERIA_EPSILON), _use_initial_estimate(false), _pyramid_scale(0.0f), _epsilon(0.0f), _num_iterations(0), _window_dimension(0), _level(0), + _num_levels(0), _valid_region() +{ +} + +BorderSize NELKTrackerKernel::border_size() const +{ + return BorderSize(1); +} + +void NELKTrackerKernel::configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy, + const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points, + INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal, + Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension, + size_t level, size_t num_levels, float pyramid_scale) + +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_old, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input_new, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gx, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(old_scharr_gy, 1, DataType::S16); + + _input_old = input_old; + _input_new = input_new; + _old_scharr_gx = old_scharr_gx; + _old_scharr_gy = old_scharr_gy; + _old_points = old_points; + _new_points_estimates = new_points_estimates; + _new_points = new_points; + _old_points_internal = old_points_internal; + _new_points_internal = new_points_internal; + _termination = termination; + _use_initial_estimate = use_initial_estimate; + _epsilon = epsilon; + _num_iterations = num_iterations; + _window_dimension = window_dimension; + _level = level; + _num_levels = num_levels; + _pyramid_scale = pyramid_scale; + + Window window; + window.set(Window::DimX, Window::Dimension(0, old_points->num_values())); + window.set(Window::DimY, Window::Dimension(0, 1)); + + _valid_region = intersect_valid_regions( +
input_old->info()->valid_region(), + input_new->info()->valid_region(), + old_scharr_gx->info()->valid_region(), + old_scharr_gy->info()->valid_region()); + + update_window_and_padding(window, + AccessWindowStatic(input_old->info(), _valid_region.start(0), _valid_region.start(1), + _valid_region.end(0), _valid_region.end(1)), + AccessWindowStatic(input_new->info(), _valid_region.start(0), _valid_region.start(1), + _valid_region.end(0), _valid_region.end(1)), + AccessWindowStatic(old_scharr_gx->info(), _valid_region.start(0), _valid_region.start(1), + _valid_region.end(0), _valid_region.end(1)), + AccessWindowStatic(old_scharr_gy->info(), _valid_region.start(0), _valid_region.start(1), + _valid_region.end(0), _valid_region.end(1))); + + INEKernel::configure(window); +} + +void NELKTrackerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + ARM_COMPUTE_ERROR_ON(_input_old->buffer() == nullptr); + ARM_COMPUTE_ERROR_ON(_input_new->buffer() == nullptr); + ARM_COMPUTE_ERROR_ON(_old_scharr_gx->buffer() == nullptr); + ARM_COMPUTE_ERROR_ON(_old_scharr_gy->buffer() == nullptr); + + const int list_end = window.x().end(); + const int list_start = window.x().start(); + + init_keypoints(list_start, list_end); + + const int buffer_size = _window_dimension * _window_dimension; + int bilinear_ix[buffer_size]; + int bilinear_iy[buffer_size]; + + const int half_window = _window_dimension / 2; + + auto is_invalid_keypoint = [&](const NELKInternalKeypoint & keypoint) + { + const int x = std::floor(keypoint.x); + const int y = std::floor(keypoint.y); + + return (x - half_window < _valid_region.start(0)) || (x + half_window >= _valid_region.end(0) - 1) || (y - half_window < _valid_region.start(1)) || (y + half_window >= _valid_region.end(1) - 1); + }; + + for(int list_indx = list_start; list_indx < list_end; ++list_indx) + { + NELKInternalKeypoint &old_keypoint = _old_points_internal->at(list_indx); + NELKInternalKeypoint &new_keypoint = _new_points_internal->at(list_indx); + + if(!old_keypoint.tracking_status) + { + continue; + } + + if(is_invalid_keypoint(old_keypoint)) + { + if(_level == 0) + { + new_keypoint.tracking_status = false; + } + + continue; + } + + // Compute spatial gradient matrix + int iA11 = 0; + int iA12 = 0; + int iA22 = 0; + + std::tie(iA11, iA12, iA22) = compute_spatial_gradient_matrix(old_keypoint, bilinear_ix, bilinear_iy); + + const float A11 = iA11 * FLT_SCALE; + const float A12 = iA12 * FLT_SCALE; + const float A22 = iA22 * FLT_SCALE; + + // Calculate minimum eigenvalue + const float sum_A11_A22 = A11 + A22; + const float discriminant = sum_A11_A22 * sum_A11_A22 - 4.0f * (A11 * A22 - A12 * A12); + // Divide by _window_dimension^2 to reduce the floating point accummulation error + const float minimum_eigenvalue = (sum_A11_A22 - std::sqrt(discriminant)) / (2.0f * _window_dimension * _window_dimension); + + // Determinant + const double D = A11 * A22 - A12 * A12; + + // Check if it is a good point to track + if(minimum_eigenvalue < EIGENVALUE_THRESHOLD || D < DETERMINANT_THRESHOLD) + { + // Invalidate tracked point + if(_level == 0) + { + new_keypoint.tracking_status = false; + } + + continue; + } + + float prev_delta_x = 0.0f; + float prev_delta_y = 0.0f; + + for(unsigned int j = 0; j < _num_iterations || _termination == Termination::TERM_CRITERIA_EPSILON; ++j) + { + if(is_invalid_keypoint(new_keypoint)) + { + if(_level == 0) + { + new_keypoint.tracking_status = false; + } + + 
break; + } + + // Compute image mismatch vector + int ib1 = 0; + int ib2 = 0; + + std::tie(ib1, ib2) = compute_image_mismatch_vector(old_keypoint, new_keypoint, bilinear_ix, bilinear_iy); + + double b1 = ib1 * FLT_SCALE; + double b2 = ib2 * FLT_SCALE; + + // Compute motion vector -> A^-1 * -b + const float delta_x = (A12 * b2 - A22 * b1) / D; + const float delta_y = (A12 * b1 - A11 * b2) / D; + + // Update the new position + new_keypoint.x += delta_x; + new_keypoint.y += delta_y; + + const float mag2 = delta_x * delta_x + delta_y * delta_y; + + // Check if termination criteria is EPSILON and if it is satisfied + if(mag2 <= _epsilon && (_termination == Termination::TERM_CRITERIA_EPSILON || _termination == Termination::TERM_CRITERIA_BOTH)) + { + break; + } + + // Check convergence analyzing the previous delta + if(j > 0 && std::fabs(delta_x + prev_delta_x) < 0.01f && std::fabs(delta_y + prev_delta_y) < 0.01f) + { + new_keypoint.x -= delta_x * _pyramid_scale; + new_keypoint.y -= delta_y * _pyramid_scale; + break; + } + + prev_delta_x = delta_x; + prev_delta_y = delta_y; + } + } + + if(_level == 0) + { + for(int list_indx = list_start; list_indx < list_end; ++list_indx) + { + const NELKInternalKeypoint &new_keypoint = _new_points_internal->at(list_indx); + + _new_points->at(list_indx).x = roundf(new_keypoint.x); + _new_points->at(list_indx).y = roundf(new_keypoint.y); + _new_points->at(list_indx).tracking_status = new_keypoint.tracking_status ? 1 : 0; + } + } +} diff --git a/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp new file mode 100644 index 0000000000..ab84efbf23 --- /dev/null +++ b/src/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +void vector_matrix_multiply_f32(const ITensor *input0, const ITensor *input1, ITensor *output, const Window &window) +{ + const auto width_matrix_b = static_cast(output->info()->dimension(0)); + const auto in_b_stride = static_cast(input1->info()->strides_in_bytes()[1] / data_size_from_type(input1->info()->data_type())); + const auto num_elems_vec_a = static_cast(input0->info()->dimension(0)); + + // The implementation computes 16 elements per iteration + const int window_start_x = 16 * window.thread_id(); + const int window_step_x = 16 * window.num_threads(); + // Make sure (window_end_x - window_start_x) is a multiple of window_step_x + const int window_end_x = ceil_to_multiple(width_matrix_b - window_start_x, window_step_x) + window_start_x; + + Window win_out(window); + win_out.set(Window::DimX, Window::Dimension(window_start_x, window_end_x, window_step_x)); + + Window win_a(window); + win_a.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator ina(input0, win_a); + Iterator out(output, win_out); + + execute_window_loop(win_out, [&](const Coordinates & id) + { + if(id.x() > width_matrix_b) + { + return; + } + + float32x4_t acc0 = vdupq_n_f32(0.f); + float32x4_t acc1 = vdupq_n_f32(0.f); + float32x4_t acc2 = vdupq_n_f32(0.f); + float32x4_t acc3 = vdupq_n_f32(0.f); + + auto vec_a = reinterpret_cast(ina.ptr()); + auto matrix_b = reinterpret_cast(input1->ptr_to_element(Coordinates(id[0], 0, id[1]))); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b))); + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(matrix_b + in_b_stride))); +#endif + + const float *vec_a_end_addr = vec_a + num_elems_vec_a; + + for(; vec_a <= (vec_a_end_addr - 4);) + { + float32x2_t a0l = vld1_f32(vec_a); + + float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + float32x4_t b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + float32x4_t b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + float32x4_t b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + float32x4_t b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + +#if __arm__ + asm volatile("PLD [%0, #128*4]" ::"r"(reinterpret_cast(vec_a))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 1 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 2 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 3 * in_b_stride))); + asm volatile("PLD [%0, #128*1]" ::"r"(reinterpret_cast(matrix_b + 4 * in_b_stride))); +#endif + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = 
vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + + a0l = vld1_f32(vec_a); + + b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + b10 = vld1q_f32(matrix_b + 0 + 1 * in_b_stride); + b11 = vld1q_f32(matrix_b + 4 + 1 * in_b_stride); + b12 = vld1q_f32(matrix_b + 8 + 1 * in_b_stride); + b13 = vld1q_f32(matrix_b + 12 + 1 * in_b_stride); + + acc0 = vmlaq_lane_f32(acc0, b00, a0l, 0); + acc1 = vmlaq_lane_f32(acc1, b01, a0l, 0); + acc2 = vmlaq_lane_f32(acc2, b02, a0l, 0); + acc3 = vmlaq_lane_f32(acc3, b03, a0l, 0); + + acc0 = vmlaq_lane_f32(acc0, b10, a0l, 1); + acc1 = vmlaq_lane_f32(acc1, b11, a0l, 1); + acc2 = vmlaq_lane_f32(acc2, b12, a0l, 1); + acc3 = vmlaq_lane_f32(acc3, b13, a0l, 1); + + vec_a += 2; + matrix_b += 2 * in_b_stride; + } + + for(; vec_a < vec_a_end_addr;) + { + const float a0 = *vec_a; + + const float32x4_t b00 = vld1q_f32(matrix_b + 0 + 0 * in_b_stride); + const float32x4_t b01 = vld1q_f32(matrix_b + 4 + 0 * in_b_stride); + const float32x4_t b02 = vld1q_f32(matrix_b + 8 + 0 * in_b_stride); + const float32x4_t b03 = vld1q_f32(matrix_b + 12 + 0 * in_b_stride); + + acc0 = vmlaq_n_f32(acc0, b00, a0); + acc1 = vmlaq_n_f32(acc1, b01, a0); + acc2 = vmlaq_n_f32(acc2, b02, a0); + acc3 = vmlaq_n_f32(acc3, b03, a0); + + vec_a += 1; + matrix_b += in_b_stride; + } + + const auto vec_out = reinterpret_cast(out.ptr()); + + vst1q_f32(vec_out + 0, acc0); + vst1q_f32(vec_out + 4, acc1); + vst1q_f32(vec_out + 8, acc2); + vst1q_f32(vec_out + 12, acc3); + }, + ina, out); +} +} // namespace + +NELocallyConnectedMatrixMultiplyKernel::NELocallyConnectedMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void NELocallyConnectedMatrixMultiplyKernel::configure(const ITensor *input0, const ITensor *input1, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + + _input0 = input0; + _input1 = input1; + _output = output; + + unsigned int num_elems_processed_per_iteration_x = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x); + + update_window_and_padding(win, + AccessWindowHorizontal(input0->info(), 0, num_elems_processed_per_iteration_x), + AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration_x), + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NELocallyConnectedMatrixMultiplyKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + 
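// Note: vector_matrix_multiply_f32 reads the weights through Coordinates(id[0], 0, id[1]), so each
+    // output row pulls its own slice of input1; presumably this per-position weighting is what makes
+    // the multiplication "locally connected" rather than a single shared GEMM.
+    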
vector_matrix_multiply_f32(_input0, _input1, _output, window); +} diff --git a/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp new file mode 100644 index 0000000000..a874d219d7 --- /dev/null +++ b/src/core/NEON/kernels/NEMagnitudePhaseKernel.cpp @@ -0,0 +1,869 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +// Defines for computing atan2 +constexpr float SCALE_FACTOR = 0.7111111111111111f; +constexpr float PI = 3.141592653589793f; +constexpr float SCALE_180 = 180.0f / PI; +constexpr float SCALE_360 = SCALE_180 * SCALE_FACTOR; +constexpr float PI_4 = 0.7853981633974483f; +constexpr float COEFF1 = 0.0663f; +constexpr float COEFF2 = 0.2447f; +} // namespace + +#ifdef ARM_COMPUTE_ENABLE_FP16 +namespace fp16 +{ +inline float16x8_t inv(float16x8_t x) +{ + const float16x8_t estimate = vrecpeq_f16(x); + return vmulq_f16(estimate, vrecpsq_f16(x, estimate)); +} + +inline float16x8_t atan2_fast(float16x8_t gx, float16x8_t gy, float16x8_t scale) +{ + static const float16x8_t one = vdupq_n_f16(1.0f); + static const float16x8_t ninety = vdupq_n_f16(90.f * SCALE_FACTOR); + static const float16x8_t epsilon = vdupq_n_f16(1e-9f); + static const float16x8_t piover4 = vdupq_n_f16(PI_4); + static const float16x8_t coeff1 = vdupq_n_f16(COEFF1); + static const float16x8_t coeff2 = vdupq_n_f16(COEFF2); + + const float16x8_t abs_gx = vabsq_f16(gx); + const float16x8_t abs_gy = vabsq_f16(gy); + const float16x8_t tmin = vminq_f16(abs_gx, abs_gy); + const float16x8_t tmax = vmaxq_f16(abs_gx, abs_gy); + + // z = min(x, y) / max(x, y) + const float16x8_t z = vmulq_f16(tmin, inv(vaddq_f16(tmax, epsilon))); + const float16x8_t absz = vabsq_f16(z); + + // = x * [pi/4 + (1 - |x|) * (0.2447 + 0.0663 * |x|)] + float16x8_t arctan = vmulq_f16(z, vfmaq_f16(piover4, + vsubq_f16(one, absz), + vfmaq_f16(coeff2, coeff1, absz))); + + // Radians to degrees conversion with applied a scale factor in order to have the result [0, 255] + arctan = vmulq_f16(arctan, 
scale); + + /* If z > 1, result = 90 - result */ + return vbslq_f16(vcgeq_f16(abs_gx, abs_gy), arctan, vsubq_f16(ninety, arctan)); +} + +inline float16x8_t atan2_0_360(float16x8_t gx, float16x8_t gy) +{ + static const float16x8_t scale = vdupq_n_f16(SCALE_360); + static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR); + static const float16x8_t zero = vdupq_n_f16(0.0f); + static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR); + + float16x8_t arctan = atan2_fast(gx, gy, scale); + + // Choose correct quadrant + arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan); + arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan); + + return arctan; +} + +inline float16x8_t atan2_0_180(float16x8_t gx, float16x8_t gy) +{ + static const float16x8_t scale = vdupq_n_f16(SCALE_180); + static const float16x8_t threesixty = vdupq_n_f16(360.0f * SCALE_FACTOR); + static const float16x8_t oneeighty = vdupq_n_f16(180.0f * SCALE_FACTOR); + static const float16x8_t zero = vdupq_n_f16(0.0f); + + float16x8_t arctan = atan2_fast(gx, gy, scale); + + // Choose correct quadrant + arctan = vbslq_f16(vcltq_f16(gx, zero), vsubq_f16(oneeighty, arctan), arctan); + arctan = vbslq_f16(vcltq_f16(gy, zero), vsubq_f16(threesixty, arctan), arctan); + arctan = vbslq_f16(vcgtq_f16(arctan, oneeighty), vsubq_f16(arctan, oneeighty), arctan); + + return arctan; +} + +inline float32x4_t invsqrtv(float32x4_t x) +{ + float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); + + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal); + + return sqrt_reciprocal; +} + +inline float32x4_t sqrtv(float32x4_t x) +{ + float32x4_t res = vdupq_n_f32(0.5f); + return vmlaq_f32(res, x, invsqrtv(x)); +} + +inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2) +{ + return vqaddq_s16(vabsq_s16(input1), vabsq_s16(input2)); +} + +inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2) +{ + const int32x4x2_t square_x = + { + vmull_s16(vget_low_s16(input1), vget_low_s16(input1)), + vmull_s16(vget_high_s16(input1), vget_high_s16(input1)) + }; + + const int32x4x2_t square_y = + { + vmull_s16(vget_low_s16(input2), vget_low_s16(input2)), + vmull_s16(vget_high_s16(input2), vget_high_s16(input2)) + }; + + const uint32x4x2_t sum = + { + vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), + vreinterpretq_u32_s32(square_y.val[0])), + vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), + vreinterpretq_u32_s32(square_y.val[1])) + }; + + const float32x4x2_t res = + { + sqrtv(vcvtq_f32_u32(sum.val[0])), + sqrtv(vcvtq_f32_u32(sum.val[1])) + }; + + return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])), + vqmovn_s32(vcvtq_s32_f32(res.val[1]))); +} + +inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2) +{ + static const float16x8_t zeropointfive = vdupq_n_f16(0.5f); + + const float16x8_t inputx_f16 = vcvtq_f16_s16(input1); + const float16x8_t inputy_f16 = vcvtq_f16_s16(input2); + + // Compute fast atan2 + const float16x8_t angle = atan2_0_360(inputx_f16, inputy_f16); + + return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive))); +} + +inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2) +{ + static const float16x8_t zeropointfive = vdupq_n_f16(0.5f); + + const float16x8_t inputx_f16 = vcvtq_f16_s16(input1); + const float16x8_t inputy_f16 = vcvtq_f16_s16(input2); + + // Compute fast 
atan2 + const float16x8_t angle = atan2_0_180(inputx_f16, inputy_f16); + + return vqmovun_s16(vcvtq_s16_f16(vaddq_f16(angle, zeropointfive))); +} + +template +inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx); + +template <> +inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx) +{ + const int16x8x2_t mag = + { + magnitude_l2(in0.val[0], gx.val[0]), + magnitude_l2(in0.val[1], gx.val[1]) + }; + + return mag; +} + +template <> +inline int16x8x2_t compute_magnitude(const int16x8x2_t &in0, const int16x8x2_t &gx) +{ + const int16x8x2_t mag = + { + magnitude_l1(in0.val[0], gx.val[0]), + magnitude_l1(in0.val[1], gx.val[1]) + }; + + return mag; +} + +template +inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx); + +template <> +inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx) +{ + return vcombine_u8(phase_signed(in0.val[0], gx.val[0]), + phase_signed(in0.val[1], gx.val[1])); +} + +template <> +inline uint8x16_t compute_phase(const int16x8x2_t &in0, const int16x8x2_t &gx) +{ + return vcombine_u8(phase_unsigned(in0.val[0], gx.val[0]), + phase_unsigned(in0.val[1], gx.val[1])); +} +} // namespace fp16 + +template +NEMagnitudePhaseFP16Kernel::NEMagnitudePhaseFP16Kernel() + : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) +{ +} + +template +void NEMagnitudePhaseFP16Kernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gx, Format::S16); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(gy, Format::S16); + ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase)); + + const bool run_mag = magnitude != nullptr; + const bool run_phase = phase != nullptr; + + if(run_mag) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(magnitude, Format::S16); + } + + if(run_phase) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(phase, Format::U8); + } + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + if(run_mag && run_phase) + { + /* Run magnitude and phase */ + _func = &NEMagnitudePhaseFP16Kernel::magnitude_phase; + } + else if(run_mag) + { + /* Run magnitude */ + _func = &NEMagnitudePhaseFP16Kernel::magnitude; + } + else if(run_phase) + { + /* Run phase */ + _func = &NEMagnitudePhaseFP16Kernel::phase; + } + else + { + ARM_COMPUTE_ERROR("At least one output must be NOT NULL"); + } + + const unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal phase_access(phase == nullptr ? 
nullptr : phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration), + magnitude_access, + phase_access); + + ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(), + gy->info()->valid_region()); + + magnitude_access.set_valid_region(win, valid_region); + phase_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +template +void NEMagnitudePhaseFP16Kernel::magnitude(const Window &window) +{ + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator magnitude(_magnitude, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t input1 = + { + vld1q_s16(reinterpret_cast(gx.ptr())), + vld1q_s16(reinterpret_cast(gx.ptr()) + 8) + }; + + const int16x8x2_t input2 = + { + vld1q_s16(reinterpret_cast(gy.ptr())), + vld1q_s16(reinterpret_cast(gy.ptr()) + 8) + }; + + // Compute and store magnitude + const int16x8x2_t mag = fp16::compute_magnitude(input1, input2); + + /* Store magnitude */ + vst1q_s16(reinterpret_cast(magnitude.ptr()), mag.val[0]); + vst1q_s16(reinterpret_cast(magnitude.ptr()) + 8, mag.val[1]); + }, + gx, gy, magnitude); +} + +template +void NEMagnitudePhaseFP16Kernel::phase(const Window &window) +{ + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator phase(_phase, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t input1 = + { + vld1q_s16(reinterpret_cast(gx.ptr())), + vld1q_s16(reinterpret_cast(gx.ptr()) + 8) + }; + + const int16x8x2_t input2 = + { + vld1q_s16(reinterpret_cast(gy.ptr())), + vld1q_s16(reinterpret_cast(gy.ptr()) + 8) + }; + + // Compute and store phase + vst1q_u8(phase.ptr(), fp16::compute_phase(input1, input2)); + }, + gx, gy, phase); +} + +template +void NEMagnitudePhaseFP16Kernel::magnitude_phase(const Window &window) +{ + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator magnitude(_magnitude, window); + Iterator phase(_phase, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t input1 = + { + vld1q_s16(reinterpret_cast(gx.ptr())), + vld1q_s16(reinterpret_cast(gx.ptr()) + 8) + }; + + const int16x8x2_t input2 = + { + vld1q_s16(reinterpret_cast(gy.ptr())), + vld1q_s16(reinterpret_cast(gy.ptr()) + 8) + }; + + // Compute and store magnitude + const int16x8x2_t mag = fp16::compute_magnitude(input1, input2); + + vst1q_s16(reinterpret_cast(magnitude.ptr()), mag.val[0]); + vst1q_s16(reinterpret_cast(magnitude.ptr()) + 8, mag.val[1]); + + // Compute and store phase + vst1q_u8(phase.ptr(), fp16::compute_phase(input1, input2)); + }, + gx, gy, magnitude, phase); +} + +template +void NEMagnitudePhaseFP16Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +template class arm_compute::NEMagnitudePhaseFP16Kernel; +template class arm_compute::NEMagnitudePhaseFP16Kernel; +template class arm_compute::NEMagnitudePhaseFP16Kernel; +template class arm_compute::NEMagnitudePhaseFP16Kernel; +#endif + +namespace +{ +inline float32x4_t inv(float32x4_t x) +{ + float32x4_t result = vrecpeq_f32(x); + result = vmulq_f32(vrecpsq_f32(x, result), result); + return result; +} + +inline float32x4_t atan2_0_360(float32x4_t gx, float32x4_t gy) +{ + 
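// Fast branch-free atan2: z = min(|gx|, |gy|) / max(|gx|, |gy|) is pushed through the polynomial
+    // z * (pi/4 + (1 - |z|) * (0.2447 + 0.0663 * |z|)) ~= atan(z) and then corrected per octant/quadrant below.
+    // The SCALE_360 factor (SCALE_FACTOR = 256 / 360) maps a full turn onto the U8 range [0, 255].
+    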
const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t epsilon = vdupq_n_f32(1e-9f); + const float32x4_t piover4 = vdupq_n_f32(PI_4); + const float32x4_t coeff1 = vdupq_n_f32(COEFF1); + const float32x4_t coeff2 = vdupq_n_f32(COEFF2); + const float32x4_t ninety = vdupq_n_f32(90.0f * SCALE_FACTOR); + const float32x4_t oneeighty = vdupq_n_f32(180.0f * SCALE_FACTOR); + const float32x4_t threesixty = vdupq_n_f32(360.0f * SCALE_FACTOR); + const float32x4_t scale = vdupq_n_f32(SCALE_360); + + float32x4_t abs_gx = vabsq_f32(gx); + float32x4_t abs_gy = vabsq_f32(gy); + float32x4_t tmin = vminq_f32(abs_gx, abs_gy); + float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy); + float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon))); + float32x4_t absz = vabsq_f32(z); + float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz)); + + /* Compute y = pi/4 * x - x*(abs(x)-1)*(0.2447+0.0663 * abs(x) */ + float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1)); + result = vmulq_f32(result, term); + result = vmlaq_f32(result, piover4, z); + + /* Radians to degrees conversion with applied a scale factor in order to have the result [0, 255] */ + result = vmulq_f32(result, scale); + + /* If z > 1, result = 90 - result */ + result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result)); + + /* Choose correct quadrant */ + result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result); + result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result); + + return result; +} + +inline float32x4_t atan2_0_180(float32x4_t gx, float32x4_t gy) +{ + const float32x4_t zero = vdupq_n_f32(0.0f); + const float32x4_t epsilon = vdupq_n_f32(1e-9f); // epsilon used to avoiding division by 0 + const float32x4_t piover4 = vdupq_n_f32(PI_4); + const float32x4_t coeff1 = vdupq_n_f32(COEFF1); + const float32x4_t coeff2 = vdupq_n_f32(COEFF2); + const float32x4_t ninety = vdupq_n_f32(90.0f); + const float32x4_t oneeighty = vdupq_n_f32(180.0f); + const float32x4_t threesixty = vdupq_n_f32(360.0f); + const float32x4_t scale = vdupq_n_f32(SCALE_180); + + float32x4_t abs_gx = vabsq_f32(gx); + float32x4_t abs_gy = vabsq_f32(gy); + float32x4_t tmin = vminq_f32(abs_gx, abs_gy); + float32x4_t tmax = vmaxq_f32(abs_gx, abs_gy); + float32x4_t z = vmulq_f32(tmin, inv(vaddq_f32(tmax, epsilon))); + float32x4_t absz = vabsq_f32(z); + + /* Compute y = pi/4 * z - z*(abs(z)-1)*(0.2447+0.0663 * abs(z) */ + float32x4_t term = vmulq_f32(z, vsubq_f32(vdupq_n_f32(1.0f), absz)); + float32x4_t result = vaddq_f32(coeff2, vmulq_f32(absz, coeff1)); + result = vmulq_f32(result, term); + result = vmlaq_f32(result, piover4, z); + + /* Radians to degrees conversion */ + result = vmulq_f32(result, scale); + + /* If z > 1, result = 90 - result */ + result = vbslq_f32(vcgeq_f32(abs_gx, abs_gy), result, vsubq_f32(ninety, result)); + + /* Choose correct quadrant */ + result = vbslq_f32(vcltq_f32(gx, zero), vsubq_f32(oneeighty, result), result); + result = vbslq_f32(vcltq_f32(gy, zero), vsubq_f32(threesixty, result), result); + result = vbslq_f32(vcgtq_f32(result, oneeighty), vsubq_f32(result, oneeighty), result); + + return result; +} + +inline float32x4_t invsqrtv(float32x4_t x) +{ + float32x4_t sqrt_reciprocal = vrsqrteq_f32(x); + + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal); + sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), + sqrt_reciprocal); + + return sqrt_reciprocal; +} + +inline 
float32x4_t sqrtv(float32x4_t x) +{ + float32x4_t res = vdupq_n_f32(0.5f); + return vmlaq_f32(res, x, invsqrtv(x)); +} + +inline int16x8_t magnitude_l2(int16x8_t input1, int16x8_t input2) +{ + const int32x4x2_t square_x = + { + { + vmull_s16(vget_low_s16(input1), vget_low_s16(input1)), + vmull_s16(vget_high_s16(input1), vget_high_s16(input1)) + } + }; + + const int32x4x2_t square_y = + { + { + vmull_s16(vget_low_s16(input2), vget_low_s16(input2)), + vmull_s16(vget_high_s16(input2), vget_high_s16(input2)) + } + }; + + const uint32x4x2_t sum = + { + { + vaddq_u32(vreinterpretq_u32_s32(square_x.val[0]), vreinterpretq_u32_s32(square_y.val[0])), + vaddq_u32(vreinterpretq_u32_s32(square_x.val[1]), vreinterpretq_u32_s32(square_y.val[1])) + } + }; + + const float32x4x2_t res = + { + { + sqrtv(vcvtq_f32_u32(sum.val[0])), + sqrtv(vcvtq_f32_u32(sum.val[1])) + } + }; + + return vcombine_s16(vqmovn_s32(vcvtq_s32_f32(res.val[0])), + vqmovn_s32(vcvtq_s32_f32(res.val[1]))); +} + +inline int16x8_t magnitude_l1(int16x8_t input1, int16x8_t input2) +{ + int16x8_t gx_abs = vabsq_s16(input1); + int16x8_t gy_abs = vabsq_s16(input2); + + /* Saturating add */ + return vqaddq_s16(gx_abs, gy_abs); +} + +inline uint8x8_t phase_signed(int16x8_t input1, int16x8_t input2) +{ + const float32x4_t zeropointfive = vdupq_n_f32(0.5f); + + float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1))); + float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1))); + float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2))); + float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2))); + + /* Compute fast atan2 */ + float32x4_t angle_high = atan2_0_360(inputx_f32_high, inputy_f32_high); + float32x4_t angle_low = atan2_0_360(inputx_f32_low, inputy_f32_low); + + angle_high = vaddq_f32(angle_high, zeropointfive); + angle_low = vaddq_f32(angle_low, zeropointfive); + + return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)), + vqmovun_s32(vcvtq_s32_f32(angle_high)))); +} + +inline uint8x8_t phase_unsigned(int16x8_t input1, int16x8_t input2) +{ + const float32x4_t zeropointfive = vdupq_n_f32(0.5f); + + float32x4_t inputx_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input1))); + float32x4_t inputx_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input1))); + float32x4_t inputy_f32_high = vcvtq_f32_s32(vmovl_s16(vget_high_s16(input2))); + float32x4_t inputy_f32_low = vcvtq_f32_s32(vmovl_s16(vget_low_s16(input2))); + + /* Compute fast atan2 */ + float32x4_t angle_high = atan2_0_180(inputx_f32_high, inputy_f32_high); + float32x4_t angle_low = atan2_0_180(inputx_f32_low, inputy_f32_low); + + angle_high = vaddq_f32(angle_high, zeropointfive); + angle_low = vaddq_f32(angle_low, zeropointfive); + + return vmovn_u16(vcombine_u16(vqmovun_s32(vcvtq_s32_f32(angle_low)), + vqmovun_s32(vcvtq_s32_f32(angle_high)))); +} +} // namespace + +template +NEMagnitudePhaseKernel::NEMagnitudePhaseKernel() + : _func(nullptr), _gx(nullptr), _gy(nullptr), _magnitude(nullptr), _phase(nullptr) +{ +} + +template +void NEMagnitudePhaseKernel::configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gx, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(gy, 1, DataType::S16); + ARM_COMPUTE_ERROR_ON((nullptr == magnitude) && (nullptr == phase)); + + const bool run_mag = magnitude != nullptr; + const bool run_phase = phase != nullptr; + + if(run_mag) + { + 
ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(magnitude, 1, DataType::S16); + } + + if(run_phase) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(phase, 1, DataType::U8); + } + + _gx = gx; + _gy = gy; + _magnitude = magnitude; + _phase = phase; + + if(run_mag && run_phase) + { + /* Run magnitude and phase */ + _func = &NEMagnitudePhaseKernel::magnitude_phase; + } + else + { + if(run_mag) + { + /* Run magnitude */ + _func = &NEMagnitudePhaseKernel::magnitude; + } + else if(run_phase) + { + /* Run phase */ + _func = &NEMagnitudePhaseKernel::phase; + } + else + { + ARM_COMPUTE_ERROR("At least one output must be NOT NULL"); + } + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*gx->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal magnitude_access(magnitude == nullptr ? nullptr : magnitude->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal phase_access(phase == nullptr ? nullptr : phase->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(gx->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(gy->info(), 0, num_elems_processed_per_iteration), + magnitude_access, + phase_access); + + ValidRegion valid_region = intersect_valid_regions(gx->info()->valid_region(), + gy->info()->valid_region()); + + magnitude_access.set_valid_region(win, valid_region); + phase_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +template +void NEMagnitudePhaseKernel::magnitude(const Window &window) +{ + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator magnitude(_magnitude, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t input1 = + { + { + vld1q_s16(reinterpret_cast(gx.ptr())), + vld1q_s16(reinterpret_cast(gx.ptr()) + 8) + } + }; + + const int16x8x2_t input2 = + { + { + vld1q_s16(reinterpret_cast(gy.ptr())), + vld1q_s16(reinterpret_cast(gy.ptr()) + 8) + } + }; + + /* Compute magnitude */ + int16x8x2_t mag{ {} }; + + if(MagnitudeType::L2NORM == mag_type) + { + mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]); + mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]); + } + else + { + mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]); + mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]); + } + + /* Store magnitude */ + vst1q_s16(reinterpret_cast(magnitude.ptr()), mag.val[0]); + vst1q_s16(reinterpret_cast(magnitude.ptr()) + 8, mag.val[1]); + }, + gx, gy, magnitude); +} + +template +void NEMagnitudePhaseKernel::phase(const Window &window) +{ + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator phase(_phase, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t input1 = + { + { + vld1q_s16(reinterpret_cast(gx.ptr())), + vld1q_s16(reinterpret_cast(gx.ptr()) + 8) + } + }; + + const int16x8x2_t input2 = + { + { + vld1q_s16(reinterpret_cast(gy.ptr())), + vld1q_s16(reinterpret_cast(gy.ptr()) + 8) + } + }; + + /* Compute phase */ + uint8x8x2_t vphase{ {} }; + + if(PhaseType::SIGNED == phase_type) + { + vphase.val[0] = phase_signed(input1.val[0], input2.val[0]); + vphase.val[1] = phase_signed(input1.val[1], input2.val[1]); + } + else + { + vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]); + vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]); + } + + /* Store phase */ + vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1])); + 
}, + gx, gy, phase); +} + +template +void NEMagnitudePhaseKernel::magnitude_phase(const Window &window) +{ + Iterator gx(_gx, window); + Iterator gy(_gy, window); + Iterator magnitude(_magnitude, window); + Iterator phase(_phase, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const int16x8x2_t input1 = + { + { + vld1q_s16(reinterpret_cast(gx.ptr())), + vld1q_s16(reinterpret_cast(gx.ptr()) + 8) + } + }; + + const int16x8x2_t input2 = + { + { + vld1q_s16(reinterpret_cast(gy.ptr())), + vld1q_s16(reinterpret_cast(gy.ptr()) + 8) + } + }; + + /* Compute magnitude */ + int16x8x2_t mag{ {} }; + + if(MagnitudeType::L2NORM == mag_type) + { + mag.val[0] = magnitude_l2(input1.val[0], input2.val[0]); + mag.val[1] = magnitude_l2(input1.val[1], input2.val[1]); + } + else + { + mag.val[0] = magnitude_l1(input1.val[0], input2.val[0]); + mag.val[1] = magnitude_l1(input1.val[1], input2.val[1]); + } + + /* Store magnitude */ + vst1q_s16(reinterpret_cast(magnitude.ptr()), mag.val[0]); + vst1q_s16(reinterpret_cast(magnitude.ptr()) + 8, mag.val[1]); + + /* Compute phase */ + uint8x8x2_t vphase{ {} }; + + if(PhaseType::SIGNED == phase_type) + { + vphase.val[0] = phase_signed(input1.val[0], input2.val[0]); + vphase.val[1] = phase_signed(input1.val[1], input2.val[1]); + } + else + { + vphase.val[0] = phase_unsigned(input1.val[0], input2.val[0]); + vphase.val[1] = phase_unsigned(input1.val[1], input2.val[1]); + } + + /* Store phase */ + vst1q_u8(phase.ptr(), vcombine_u8(vphase.val[0], vphase.val[1])); + }, + gx, gy, magnitude, phase); +} + +template +void NEMagnitudePhaseKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +template class arm_compute::NEMagnitudePhaseKernel; +template class arm_compute::NEMagnitudePhaseKernel; +template class arm_compute::NEMagnitudePhaseKernel; +template class arm_compute::NEMagnitudePhaseKernel; diff --git a/src/core/NEON/kernels/NEMeanStdDevKernel.cpp b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp new file mode 100644 index 0000000000..4616203d66 --- /dev/null +++ b/src/core/NEON/kernels/NEMeanStdDevKernel.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +template +std::pair accumulate(const Window &window, Iterator &iterator) +{ + uint64x1_t sum = vdup_n_u64(0); + uint64x1_t sum_squared = vdup_n_u64(0); + + // Calculate sum + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t in_data = vld1q_u8(iterator.ptr()); + + // Sum of the low and high elements of data + const uint16x8_t tmp0 = vaddl_u8(vget_low_u8(in_data), vget_high_u8(in_data)); + const uint32x4_t tmp1 = vaddl_u16(vget_low_u16(tmp0), vget_high_u16(tmp0)); + const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1)); + + // Update sum + sum = vpadal_u32(sum, tmp2); + + if(calc_sum_squared) + { + const uint16x8_t square_data_low = vmull_u8(vget_low_u8(in_data), vget_low_u8(in_data)); + const uint16x8_t square_data_high = vmull_u8(vget_high_u8(in_data), vget_high_u8(in_data)); + + // Sum of the low and high elements of data + const uint32x4_t tmp0_low = vaddl_u16(vget_low_u16(square_data_low), vget_high_u16(square_data_low)); + const uint32x4_t tmp0_high = vaddl_u16(vget_low_u16(square_data_high), vget_high_u16(square_data_high)); + const uint32x4_t tmp1 = vaddq_u32(tmp0_low, tmp0_high); + const uint32x2_t tmp2 = vadd_u32(vget_low_u32(tmp1), vget_high_u32(tmp1)); + + // Update sum + sum_squared = vpadal_u32(sum_squared, tmp2); + } + }, + iterator); + + return std::make_pair(sum, sum_squared); +} +} // namespace + +NEMeanStdDevKernel::NEMeanStdDevKernel() + : _input(nullptr), _mean(nullptr), _stddev(nullptr), _global_sum(nullptr), _global_sum_squared(nullptr), _mtx() +{ +} + +void NEMeanStdDevKernel::configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev, uint64_t *global_sum_squared) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON(nullptr == mean); + ARM_COMPUTE_ERROR_ON(nullptr == global_sum); + ARM_COMPUTE_ERROR_ON(stddev && nullptr == global_sum_squared); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + + _input = input; + _mean = mean; + _stddev = stddev; + _global_sum = global_sum; + _global_sum_squared = global_sum_squared; + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + + INEKernel::configure(win); +} + +void NEMeanStdDevKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + Iterator input(_input, window); + + uint64x1_t local_sum = vdup_n_u64(0); + uint64x1_t local_sum_squared = vdup_n_u64(0); + + if(_stddev != nullptr) + { + std::tie(local_sum, local_sum_squared) = accumulate(window, input); + } + else + { + std::tie(local_sum, local_sum_squared) = accumulate(window, input); + } + + const float num_pixels = _input->info()->dimension(0) * _input->info()->dimension(1); + + // Merge sum and calculate mean and stddev + std::unique_lock 
lock(_mtx); + + *_global_sum += vget_lane_u64(local_sum, 0); + + const float mean = *_global_sum / num_pixels; + *_mean = mean; + + if(_stddev != nullptr) + { + const uint64_t tmp_sum_squared = vget_lane_u64(local_sum_squared, 0); + *_global_sum_squared += tmp_sum_squared; + *_stddev = std::sqrt((*_global_sum_squared / num_pixels) - (mean * mean)); + } + + lock.unlock(); +} diff --git a/src/core/NEON/kernels/NEMedian3x3Kernel.cpp b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp new file mode 100644 index 0000000000..601a0e109f --- /dev/null +++ b/src/core/NEON/kernels/NEMedian3x3Kernel.cpp @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace +{ +inline void sort(uint8x8_t &a, uint8x8_t &b) +{ + const uint8x8_t min = vmin_u8(a, b); + const uint8x8_t max = vmax_u8(a, b); + a = min; + b = max; +} +} // namespace + +BorderSize NEMedian3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void NEMedian3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + _input = input; + _output = output; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + constexpr int rect_offset_xy = -1; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), rect_offset_xy, rect_offset_xy, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NEMedian3x3Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + + const unsigned char *input_bot_ptr = _input->ptr_to_element(Coordinates(-1, -1)); + const unsigned char *input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0)); + const unsigned char *input_top_ptr = _input->ptr_to_element(Coordinates(-1, +1)); + + Iterator input(_input, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + + sort(p0, p1); + sort(p3, p4); + sort(p6, p7); + + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + + sort(p0, p3); + sort(p5, p8); + sort(p4, p7); + + sort(p3, p6); + sort(p1, p4); + sort(p2, p5); + + sort(p4, p7); + sort(p4, p2); + sort(p6, p4); + + sort(p4, p2); + + vst1_u8(output.ptr(), p4); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp new file mode 100644 index 0000000000..b188614752 --- 
/dev/null +++ b/src/core/NEON/kernels/NEMinMaxLocationKernel.cpp @@ -0,0 +1,361 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include + +namespace arm_compute +{ +NEMinMaxKernel::NEMinMaxKernel() + : _func(), _input(nullptr), _min(), _max(), _min_init(), _max_init(), _mtx() +{ +} + +void NEMinMaxKernel::configure(const IImage *input, int32_t *min, int32_t *max) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON(nullptr == min); + ARM_COMPUTE_ERROR_ON(nullptr == max); + + _input = input; + _min = min; + _max = max; + + switch(input->info()->format()) + { + case Format::U8: + _min_init = UCHAR_MAX; + _max_init = 0; + _func = &NEMinMaxKernel::minmax_U8; + break; + case Format::S16: + _min_init = SHRT_MAX; + _max_init = SHRT_MIN; + _func = &NEMinMaxKernel::minmax_S16; + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong img formats"); + break; + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration)); + + INEKernel::configure(win); +} + +void NEMinMaxKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +void NEMinMaxKernel::reset() +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + *_min = _min_init; + *_max = _max_init; +} + +template +void NEMinMaxKernel::update_min_max(const T min, const T max) +{ + std::lock_guard lock(_mtx); + + if(min < *_min) + { + *_min = min; + } + + if(max > *_max) + { + *_max = max; + } +} + +void NEMinMaxKernel::minmax_U8(const Window &win) +{ + uint8x8_t carry_min = vdup_n_u8(UCHAR_MAX); + 
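// Reduction performed below (note for readers of this kernel): carry_min / carry_max keep an
// 8-lane running minimum / maximum over all pixels covered by the window; the eight lanes are
// then folded with three rounds of pairwise vpmin_u8 / vpmax_u8 (8 -> 4 -> 2 -> 1 candidates
// left in lane 0), and the resulting scalars are merged into *_min / *_max under _mtx by
// update_min_max().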
uint8x8_t carry_max = vdup_n_u8(0); + + Iterator input(_input, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x16_t pixels = vld1q_u8(input.ptr()); + const uint8x8_t tmp_min = vmin_u8(vget_high_u8(pixels), vget_low_u8(pixels)); + const uint8x8_t tmp_max = vmax_u8(vget_high_u8(pixels), vget_low_u8(pixels)); + carry_min = vmin_u8(tmp_min, carry_min); + carry_max = vmax_u8(tmp_max, carry_max); + }, + input); + + // Reduce result + carry_min = vpmin_u8(carry_min, carry_min); + carry_max = vpmax_u8(carry_max, carry_max); + carry_min = vpmin_u8(carry_min, carry_min); + carry_max = vpmax_u8(carry_max, carry_max); + carry_min = vpmin_u8(carry_min, carry_min); + carry_max = vpmax_u8(carry_max, carry_max); + + // Extract max/min values + const uint8_t min_i = vget_lane_u8(carry_min, 0); + const uint8_t max_i = vget_lane_u8(carry_max, 0); + + // Perform reduction of local min/max values + update_min_max(min_i, max_i); +} + +void NEMinMaxKernel::minmax_S16(const Window &win) +{ + int16x4_t carry_min = vdup_n_s16(SHRT_MAX); + int16x4_t carry_max = vdup_n_s16(SHRT_MIN); + + Iterator input(_input, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const int16x8x2_t pixels = vld2q_s16(in_ptr); + const int16x8_t tmp_min1 = vminq_s16(pixels.val[0], pixels.val[1]); + const int16x8_t tmp_max1 = vmaxq_s16(pixels.val[0], pixels.val[1]); + const int16x4_t tmp_min2 = vmin_s16(vget_high_s16(tmp_min1), vget_low_s16(tmp_min1)); + const int16x4_t tmp_max2 = vmax_s16(vget_high_s16(tmp_max1), vget_low_s16(tmp_max1)); + carry_min = vmin_s16(tmp_min2, carry_min); + carry_max = vmax_s16(tmp_max2, carry_max); + }, + input); + + // Reduce result + carry_min = vpmin_s16(carry_min, carry_min); + carry_max = vpmax_s16(carry_max, carry_max); + carry_min = vpmin_s16(carry_min, carry_min); + carry_max = vpmax_s16(carry_max, carry_max); + + // Extract max/min values + const int16_t min_i = vget_lane_s16(carry_min, 0); + const int16_t max_i = vget_lane_s16(carry_max, 0); + + // Perform reduction of local min/max values + update_min_max(min_i, max_i); +} + +NEMinMaxLocationKernel::NEMinMaxLocationKernel() + : _func(nullptr), _input(nullptr), _min(nullptr), _max(nullptr), _min_count(nullptr), _max_count(nullptr), _min_loc(nullptr), _max_loc(nullptr), _num_elems_processed_per_iteration(0) +{ +} + +bool NEMinMaxLocationKernel::is_parallelisable() const +{ + return false; +} + +template +struct index_seq +{ + index_seq() = default; + index_seq(const index_seq &) = default; + index_seq &operator=(const index_seq &) = default; + index_seq(index_seq &&) noexcept = default; + index_seq &operator=(index_seq &&) noexcept = default; + virtual ~index_seq() = default; +}; +template +struct gen_index_seq : gen_index_seq < N - 1, N - 1, S... > +{ +}; +template +struct gen_index_seq<0u, S...> : index_seq +{ + using type = index_seq; +}; + +template +struct NEMinMaxLocationKernel::create_func_table> +{ + static const NEMinMaxLocationKernel::MinMaxLocFunction func_table[sizeof...(N)]; +}; + +template +const NEMinMaxLocationKernel::MinMaxLocFunction NEMinMaxLocationKernel::create_func_table>::func_table[sizeof...(N)] = +{ + &NEMinMaxLocationKernel::minmax_loc... 
+}; + +void NEMinMaxLocationKernel::configure(const IImage *input, int32_t *min, int32_t *max, + ICoordinates2DArray *min_loc, ICoordinates2DArray *max_loc, + uint32_t *min_count, uint32_t *max_count) +{ + ARM_COMPUTE_ERROR_ON_TENSOR_NOT_2D(input); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8, Format::S16); + ARM_COMPUTE_ERROR_ON(nullptr == min); + ARM_COMPUTE_ERROR_ON(nullptr == max); + + _input = input; + _min = min; + _max = max; + _min_count = min_count; + _max_count = max_count; + _min_loc = min_loc; + _max_loc = max_loc; + + unsigned int count_min = (nullptr != min_count ? 1 : 0); + unsigned int count_max = (nullptr != max_count ? 1 : 0); + unsigned int loc_min = (nullptr != min_loc ? 1 : 0); + unsigned int loc_max = (nullptr != max_loc ? 1 : 0); + + unsigned int table_idx = (count_min << 3) | (count_max << 2) | (loc_min << 1) | loc_max; + + switch(input->info()->format()) + { + case Format::U8: + _func = create_func_table::type>::func_table[table_idx]; + break; + case Format::S16: + _func = create_func_table::type>::func_table[table_idx]; + break; + default: + ARM_COMPUTE_ERROR("You called with the wrong img formats"); + break; + } + + _num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(_num_elems_processed_per_iteration)); + + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, _num_elems_processed_per_iteration)); + + INEKernel::configure(win); +} + +void NEMinMaxLocationKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +template +void NEMinMaxLocationKernel::minmax_loc(const Window &win) +{ + if(count_min || count_max || loc_min || loc_max) + { + Iterator input(_input, win); + + size_t min_count = 0; + size_t max_count = 0; + unsigned int step = _num_elems_processed_per_iteration; + + // Clear min location array + if(loc_min) + { + _min_loc->clear(); + } + + // Clear max location array + if(loc_max) + { + _max_loc->clear(); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + auto in_ptr = reinterpret_cast(input.ptr()); + int32_t idx = id.x(); + int32_t idy = id.y(); + + for(unsigned int i = 0; i < step; ++i) + { + const T pixel = *in_ptr++; + Coordinates2D p{ idx++, idy }; + + if(count_min || loc_min) + { + if(*_min == pixel) + { + if(count_min) + { + ++min_count; + } + + if(loc_min) + { + _min_loc->push_back(p); + } + } + } + + if(count_max || loc_max) + { + if(*_max == pixel) + { + if(count_max) + { + ++max_count; + } + + if(loc_max) + { + _max_loc->push_back(p); + } + } + } + } + }, + input); + + if(count_min) + { + *_min_count = min_count; + } + + if(count_max) + { + *_max_count = max_count; + } + } +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NENonLinearFilterKernel.cpp b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp new file mode 100644 index 0000000000..03d1409be1 --- /dev/null +++ b/src/core/NEON/kernels/NENonLinearFilterKernel.cpp @@ -0,0 +1,1009 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" + +#include +#include +#include +#include +#include + +namespace arm_compute +{ +namespace +{ +const uint8x16_t zero_u8 = vdupq_n_u8(0); + +template +inline uint8x8_t min_row(uint8x16_t row_data) +{ + uint8x8_t min = vget_low_u8(row_data); + + for(size_t c = 1; c < columns; ++c) + { + row_data = vextq_u8(row_data, zero_u8, 1); + min = vmin_u8(min, vget_low_u8(row_data)); + } + + return min; +} + +template +inline uint8x8_t max_row(uint8x16_t row_data) +{ + uint8x8_t max = vget_low_u8(row_data); + + for(size_t c = 1; c < columns; ++c) + { + row_data = vextq_u8(row_data, zero_u8, 1); + max = vmax_u8(max, vget_low_u8(row_data)); + } + + return max; +} + +inline void sort(uint8x8_t &a, uint8x8_t &b) +{ + const uint8x8_t min = vmin_u8(a, b); + const uint8x8_t max = vmax_u8(a, b); + a = min; + b = max; +} + +// Sorting networks below were generated using http://pages.ripco.net/~jgamble/nw.html +// Calculations that do not affect the median were removed. 
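// A sorting network is a fixed, data-independent sequence of compare-exchange steps, so it is
// branch-free and maps directly onto SIMD: each sort() call in the networks below performs the
// same compare-exchange on all eight uint8x8_t lanes at once via vmin_u8 / vmax_u8.
// Minimal scalar sketch of the same idea (illustrative only; these helpers are assumptions made
// for exposition and are not used by the kernel): a median of three values needs exactly three
// compare-exchange steps.
inline void compare_exchange_scalar(unsigned char &a, unsigned char &b)
{
    const unsigned char lo = (a < b) ? a : b;
    const unsigned char hi = (a < b) ? b : a;
    a = lo;
    b = hi;
}

inline unsigned char median3_scalar(unsigned char p0, unsigned char p1, unsigned char p2)
{
    compare_exchange_scalar(p0, p1); // now p0 <= p1
    compare_exchange_scalar(p1, p2); // p2 now holds the maximum of the three
    compare_exchange_scalar(p0, p1); // p1 now holds the median
    return p1;
}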
+inline void sort5(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, uint8x8_t &p3, uint8x8_t &p4) +{ + sort(p0, p1); + sort(p2, p3); + sort(p0, p2); + sort(p1, p3); + sort(p1, p2); + sort(p0, p4); + sort(p1, p4); + sort(p2, p4); +} + +inline void sort9(uint8x8_t &p0, uint8x8_t &p1, uint8x8_t &p2, + uint8x8_t &p3, uint8x8_t &p4, uint8x8_t &p5, + uint8x8_t &p6, uint8x8_t &p7, uint8x8_t &p8) +{ + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + sort(p0, p1); + sort(p3, p4); + sort(p6, p7); + sort(p1, p2); + sort(p4, p5); + sort(p7, p8); + sort(p0, p3); + sort(p5, p8); + sort(p4, p7); + sort(p3, p6); + sort(p1, p4); + sort(p2, p5); + sort(p4, p7); + sort(p4, p2); + sort(p6, p4); + sort(p4, p2); +} + +inline void sort21(uint8x8_t p[21]) +{ + sort(p[0], p[1]); + sort(p[2], p[3]); + sort(p[4], p[5]); + sort(p[6], p[7]); + sort(p[8], p[9]); + sort(p[10], p[11]); + sort(p[12], p[13]); + sort(p[14], p[15]); + sort(p[16], p[17]); + sort(p[18], p[19]); + sort(p[0], p[2]); + sort(p[1], p[3]); + sort(p[4], p[6]); + sort(p[5], p[7]); + sort(p[8], p[10]); + sort(p[9], p[11]); + sort(p[12], p[14]); + sort(p[13], p[15]); + sort(p[16], p[18]); + sort(p[17], p[19]); + sort(p[1], p[2]); + sort(p[5], p[6]); + sort(p[0], p[4]); + sort(p[3], p[7]); + sort(p[9], p[10]); + sort(p[13], p[14]); + sort(p[8], p[12]); + sort(p[11], p[15]); + sort(p[17], p[18]); + sort(p[16], p[20]); + sort(p[1], p[5]); + sort(p[2], p[6]); + sort(p[9], p[13]); + sort(p[10], p[14]); + sort(p[0], p[8]); + sort(p[7], p[15]); + sort(p[17], p[20]); + sort(p[1], p[4]); + sort(p[3], p[6]); + sort(p[9], p[12]); + sort(p[11], p[14]); + sort(p[18], p[20]); + sort(p[0], p[16]); + sort(p[2], p[4]); + sort(p[3], p[5]); + sort(p[10], p[12]); + sort(p[11], p[13]); + sort(p[1], p[9]); + sort(p[6], p[14]); + sort(p[19], p[20]); + sort(p[3], p[4]); + sort(p[11], p[12]); + sort(p[1], p[8]); + sort(p[2], p[10]); + sort(p[5], p[13]); + sort(p[7], p[14]); + sort(p[3], p[11]); + sort(p[2], p[8]); + sort(p[4], p[12]); + sort(p[7], p[13]); + sort(p[1], p[17]); + sort(p[3], p[10]); + sort(p[5], p[12]); + sort(p[1], p[16]); + sort(p[2], p[18]); + sort(p[3], p[9]); + sort(p[6], p[12]); + sort(p[2], p[16]); + sort(p[3], p[8]); + sort(p[7], p[12]); + sort(p[5], p[9]); + sort(p[6], p[10]); + sort(p[4], p[8]); + sort(p[7], p[11]); + sort(p[3], p[19]); + sort(p[5], p[8]); + sort(p[7], p[10]); + sort(p[3], p[18]); + sort(p[4], p[20]); + sort(p[6], p[8]); + sort(p[7], p[9]); + sort(p[3], p[17]); + sort(p[5], p[20]); + sort(p[7], p[8]); + sort(p[3], p[16]); + sort(p[6], p[20]); + sort(p[5], p[17]); + sort(p[7], p[20]); + sort(p[4], p[16]); + sort(p[6], p[18]); + sort(p[5], p[16]); + sort(p[7], p[19]); + sort(p[7], p[18]); + sort(p[6], p[16]); + sort(p[7], p[17]); + sort(p[10], p[18]); + sort(p[7], p[16]); + sort(p[9], p[17]); + sort(p[8], p[16]); + sort(p[9], p[16]); + sort(p[10], p[16]); +} + +inline void sort25(uint8x8_t p[25]) +{ + sort(p[1], p[2]); + sort(p[0], p[1]); + sort(p[1], p[2]); + sort(p[4], p[5]); + sort(p[3], p[4]); + sort(p[4], p[5]); + sort(p[0], p[3]); + sort(p[2], p[5]); + sort(p[2], p[3]); + sort(p[1], p[4]); + sort(p[1], p[2]); + sort(p[3], p[4]); + sort(p[7], p[8]); + sort(p[6], p[7]); + sort(p[7], p[8]); + sort(p[10], p[11]); + sort(p[9], p[10]); + sort(p[10], p[11]); + sort(p[6], p[9]); + sort(p[8], p[11]); + sort(p[8], p[9]); + sort(p[7], p[10]); + sort(p[7], p[8]); + sort(p[9], p[10]); + sort(p[0], p[6]); + sort(p[4], p[10]); + sort(p[4], p[6]); + sort(p[2], p[8]); + sort(p[2], p[4]); + sort(p[6], p[8]); + sort(p[1], p[7]); + sort(p[5], p[11]); + 
sort(p[5], p[7]); + sort(p[3], p[9]); + sort(p[3], p[5]); + sort(p[7], p[9]); + sort(p[1], p[2]); + sort(p[3], p[4]); + sort(p[5], p[6]); + sort(p[7], p[8]); + sort(p[9], p[10]); + sort(p[13], p[14]); + sort(p[12], p[13]); + sort(p[13], p[14]); + sort(p[16], p[17]); + sort(p[15], p[16]); + sort(p[16], p[17]); + sort(p[12], p[15]); + sort(p[14], p[17]); + sort(p[14], p[15]); + sort(p[13], p[16]); + sort(p[13], p[14]); + sort(p[15], p[16]); + sort(p[19], p[20]); + sort(p[18], p[19]); + sort(p[19], p[20]); + sort(p[21], p[22]); + sort(p[23], p[24]); + sort(p[21], p[23]); + sort(p[22], p[24]); + sort(p[22], p[23]); + sort(p[18], p[21]); + sort(p[20], p[23]); + sort(p[20], p[21]); + sort(p[19], p[22]); + sort(p[22], p[24]); + sort(p[19], p[20]); + sort(p[21], p[22]); + sort(p[23], p[24]); + sort(p[12], p[18]); + sort(p[16], p[22]); + sort(p[16], p[18]); + sort(p[14], p[20]); + sort(p[20], p[24]); + sort(p[14], p[16]); + sort(p[18], p[20]); + sort(p[22], p[24]); + sort(p[13], p[19]); + sort(p[17], p[23]); + sort(p[17], p[19]); + sort(p[15], p[21]); + sort(p[15], p[17]); + sort(p[19], p[21]); + sort(p[13], p[14]); + sort(p[15], p[16]); + sort(p[17], p[18]); + sort(p[19], p[20]); + sort(p[21], p[22]); + sort(p[23], p[24]); + sort(p[0], p[12]); + sort(p[8], p[20]); + sort(p[8], p[12]); + sort(p[4], p[16]); + sort(p[16], p[24]); + sort(p[12], p[16]); + sort(p[2], p[14]); + sort(p[10], p[22]); + sort(p[10], p[14]); + sort(p[6], p[18]); + sort(p[6], p[10]); + sort(p[10], p[12]); + sort(p[1], p[13]); + sort(p[9], p[21]); + sort(p[9], p[13]); + sort(p[5], p[17]); + sort(p[13], p[17]); + sort(p[3], p[15]); + sort(p[11], p[23]); + sort(p[11], p[15]); + sort(p[7], p[19]); + sort(p[7], p[11]); + sort(p[11], p[13]); + sort(p[11], p[12]); +} +} // namespace + +NENonLinearFilterKernel::NENonLinearFilterKernel() + : _border_width(0), _input(nullptr), _output(nullptr), _mask(nullptr), _pattern(MatrixPattern::BOX), _function(NonLinearFilterFunction::MIN), _func_idx(0), _border_size() +{ +} + +BorderSize NENonLinearFilterKernel::border_size() const +{ + return _border_size; +} + +void NENonLinearFilterKernel::configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, + bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(3 != mask_size && 5 != mask_size); + ARM_COMPUTE_ERROR_ON(MatrixPattern::OTHER == pattern && nullptr == mask); + + // Set class variables + _border_size = BorderSize(mask_size / 2); + _input = input; + _output = output; + _mask = mask; + _pattern = pattern; + _function = function; + + // Configure kernel window + const unsigned int num_elems_processed_per_iteration = (MatrixPattern::OTHER == pattern) ? 1 : 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + + Window win = calculate_max_window(*input->info(), num_elems_processed_per_iteration, border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, mask_size), + output_access); + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); + + // Define function index + _func_idx = (3 == mask_size) ? 
0 : 1; + + if(MatrixPattern::OTHER != pattern) + { + _func_idx = (_func_idx) * 3 + static_cast(function); + } +} + +void NENonLinearFilterKernel::fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern) +{ + unsigned int v = 0; + + for(int r = 0; r < rows; ++r) + { + for(int c = 0; c < cols; ++c, ++v) + { + uint8_t val = 0; + + switch(pattern) + { + case MatrixPattern::BOX: + val = 255; + break; + case MatrixPattern::CROSS: + val = ((r == (rows / 2)) || (c == (cols / 2))) ? 255 : 0; + break; + case MatrixPattern::DISK: + val = (((r - rows / 2.0f + 0.5f) * (r - rows / 2.0f + 0.5f)) / ((rows / 2.0f) * (rows / 2.0f)) + ((c - cols / 2.0f + 0.5f) * (c - cols / 2.0f + 0.5f)) / ((cols / 2.0f) * + (cols / 2.0f))) <= 1.0f ? 255 : 0; + break; + default: + return; + } + + mask[v] = val; + } + } +} + +template <> +void NENonLinearFilterKernel::median_filter_box<3, 3>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, 1))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + uint8x8_t p0 = vget_low_u8(top_data); + uint8x8_t p1 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 1); + uint8x8_t p2 = vext_u8(vget_low_u8(top_data), vget_high_u8(top_data), 2); + uint8x8_t p3 = vget_low_u8(mid_data); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p6 = vget_low_u8(bot_data); + uint8x8_t p7 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 1); + uint8x8_t p8 = vext_u8(vget_low_u8(bot_data), vget_high_u8(bot_data), 2); + + sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); + + vst1_u8(output.ptr(), p4); + }, + input, output); +} +template <> +void NENonLinearFilterKernel::median_filter_box<5, 5>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top2_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, -2))); + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 1))); + const auto input_bot2_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 2))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset()); + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset()); + + const uint8x8_t d[] = + { + vget_low_u8(top2_data), + vget_high_u8(top2_data), + vget_low_u8(top_data), + vget_high_u8(top_data), + vget_low_u8(mid_data), + vget_high_u8(mid_data), + vget_low_u8(bot_data), + vget_high_u8(bot_data), + vget_low_u8(bot2_data), + vget_high_u8(bot2_data) + }; + + uint8x8_t p[25]; + for(unsigned int i = 0; i < 5; ++i) + 
{ + const unsigned int idx_d = i * 2; + const unsigned int idx_p = i * 5; + + p[idx_p] = d[idx_d]; + p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1); + p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2); + p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3); + p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4); + } + + sort25(p); + + vst1_u8(output.ptr(), p[12]); + }, + input, output); +} + +template +void NENonLinearFilterKernel::min_filter_box(const Window &win) +{ + static_assert(mask_w > 0, "Mask size must not be 0"); + static_assert(mask_h > 0, "Mask size must not be 0"); + + Iterator input(_input, win); + Iterator output(_output, win); + + const int k_row_half = mask_h / 2; + const int k_col_half = mask_w / 2; + + // Set row pointers + std::array input_ptrs{ {} }; + for(int i = -k_row_half; i <= k_row_half; ++i) + { + input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + // Get min of rows + uint8x16_t rows_min = vld1q_u8(input_ptrs[0] + input.offset()); + + for(unsigned int r = 1; r < mask_h; ++r) + { + const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset()); + rows_min = vminq_u8(rows_min, data); + } + + const uint8x8_t out = min_row(rows_min); + + // Store result as U8 + vst1_u8(output.ptr(), out); + }, + input, output); +} + +template +void NENonLinearFilterKernel::max_filter_box(const Window &win) +{ + static_assert(mask_w > 0, "Mask size must not be 0"); + static_assert(mask_h > 0, "Mask size must not be 0"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + const int k_row_half = mask_h / 2; + const int k_col_half = mask_w / 2; + + // Set row pointers + std::array input_ptrs{ {} }; + for(int i = -k_row_half; i <= k_row_half; ++i) + { + input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + uint8x16_t rows_max = vld1q_u8(input_ptrs[0] + input.offset()); + + // Get max of rows + for(unsigned int r = 1; r < mask_h; ++r) + { + const uint8x16_t data = vld1q_u8(input_ptrs[r] + input.offset()); + rows_max = vmaxq_u8(rows_max, data); + } + + // Get max of columns + const uint8x8_t out = max_row(rows_max); + + // Store result as U8 + vst1_u8(output.ptr(), out); + }, + input, output); +} + +template <> +void NENonLinearFilterKernel::median_filter_cross<3, 3>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(0, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(0, 1))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset()); + + uint8x8_t p0 = top_data; + uint8x8_t p1 = vget_low_u8(mid_data); + uint8x8_t p2 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p4 = bot_data; + + sort5(p0, p1, p2, p3, p4); + + vst1_u8(output.ptr(), p2); + }, + input, output); +} + +template <> +void 
NENonLinearFilterKernel::median_filter_cross<5, 5>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top2_ptr = static_cast(_input->ptr_to_element(Coordinates(0, -2))); + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(0, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(0, 1))); + const auto input_bot2_ptr = static_cast(_input->ptr_to_element(Coordinates(0, 2))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x8_t top2_data = vld1_u8(input_top2_ptr + input.offset()); + const uint8x8_t top_data = vld1_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x8_t bot_data = vld1_u8(input_bot_ptr + input.offset()); + const uint8x8_t bot2_data = vld1_u8(input_bot2_ptr + input.offset()); + + uint8x8_t p0 = top2_data; + uint8x8_t p1 = top_data; + uint8x8_t p2 = vget_low_u8(mid_data); + uint8x8_t p3 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 1); + uint8x8_t p4 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 2); + uint8x8_t p5 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 3); + uint8x8_t p6 = vext_u8(vget_low_u8(mid_data), vget_high_u8(mid_data), 4); + uint8x8_t p7 = bot_data; + uint8x8_t p8 = bot2_data; + + sort9(p0, p1, p2, p3, p4, p5, p6, p7, p8); + + vst1_u8(output.ptr(), p4); + }, + input, output); +} + +template +void NENonLinearFilterKernel::min_filter_cross(const Window &win) +{ + static_assert(mask_w > 0, "Mask size must not be 0"); + static_assert(mask_h > 0, "Mask size must not be 0"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + const int k_row_half = mask_h / 2; + const int k_col_half = mask_w / 2; + + const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0)); + + // Set row pointers + std::array input_ptrs{ {} }; + for(int i = -k_row_half; i <= k_row_half; ++i) + { + input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(0, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + uint8x8_t rows_min = vld1_u8(input_ptrs[0] + input.offset()); + + // Get min of rows + for(unsigned int r = 1; r < mask_h; ++r) + { + const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset()); + rows_min = vmin_u8(rows_min, data); + } + + // Get min of middle row + const uint8x16_t data = vld1q_u8(mid_ptr + input.offset()); + uint8x8_t out = min_row(data); + + // Get final min + out = vmin_u8(out, rows_min); + + // Store result as U8 + vst1_u8(output.ptr(), out); + }, + input, output); +} + +template +void NENonLinearFilterKernel::max_filter_cross(const Window &win) +{ + static_assert(mask_w > 0, "Mask size must not be 0"); + static_assert(mask_h > 0, "Mask size must not be 0"); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + Iterator input(_input, win); + Iterator output(_output, win); + + const int k_row_half = mask_h / 2; + const int k_col_half = mask_w / 2; + + const unsigned char *mid_ptr = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, 0)); + + // Set row pointers + std::array input_ptrs{ {} }; + for(int i = -k_row_half; i <= k_row_half; ++i) + { + input_ptrs[k_row_half + i] = _input->buffer() + 
_input->info()->offset_element_in_bytes(Coordinates(0, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + uint8x8_t rows_max = vld1_u8(input_ptrs[0] + input.offset()); + + // Get max of rows + for(unsigned int r = 1; r < mask_h; ++r) + { + const uint8x8_t data = vld1_u8(input_ptrs[r] + input.offset()); + rows_max = vmax_u8(rows_max, data); + } + + // Get max of middle row + const uint8x16_t data = vld1q_u8(mid_ptr + input.offset()); + uint8x8_t out = max_row(data); + + // Get final max + out = vmax_u8(out, rows_max); + + // Store result as U8 + vst1_u8(output.ptr(), out); + }, + input, output); +} + +template <> +void NENonLinearFilterKernel::median_filter_disk<5, 5>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top2_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, -2))); + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 1))); + const auto input_bot2_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, 2))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset()); + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset()); + + uint8x8_t d[] = + { + vget_low_u8(top2_data), + vget_high_u8(top2_data), + vget_low_u8(top_data), + vget_high_u8(top_data), + vget_low_u8(mid_data), + vget_high_u8(mid_data), + vget_low_u8(bot_data), + vget_high_u8(bot_data), + vget_low_u8(bot2_data), + vget_high_u8(bot2_data) + }; + + uint8x8_t p[21]; + p[0] = d[0]; + p[1] = vext_u8(d[0], d[1], 1); + p[2] = vext_u8(d[0], d[1], 2); + p[18] = d[8]; + p[19] = vext_u8(d[8], d[9], 1); + p[20] = vext_u8(d[8], d[9], 2); + + for(unsigned int i = 0; i < 3; ++i) + { + const unsigned int idx_d = 2 + i * 2; + const unsigned int idx_p = 3 + i * 5; + + p[idx_p] = d[idx_d]; + p[idx_p + 1] = vext_u8(d[idx_d], d[idx_d + 1], 1); + p[idx_p + 2] = vext_u8(d[idx_d], d[idx_d + 1], 2); + p[idx_p + 3] = vext_u8(d[idx_d], d[idx_d + 1], 3); + p[idx_p + 4] = vext_u8(d[idx_d], d[idx_d + 1], 4); + } + + sort21(p); + + vst1_u8(output.ptr(), p[10]); + }, + input, output); +} + +template <> +void NENonLinearFilterKernel::min_filter_disk<5, 5>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top2_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, -2))); + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 1))); + const auto input_bot2_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, 2))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset()); + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + 
input.offset()); + + const uint8x16_t rows_min_3 = vminq_u8(top2_data, bot2_data); + uint8x16_t rows_min_5 = vminq_u8(top_data, bot_data); + rows_min_5 = vminq_u8(rows_min_5, mid_data); + + const uint8x8_t out_3 = min_row<3>(rows_min_3); + const uint8x8_t out_5 = min_row<5>(rows_min_5); + + vst1_u8(output.ptr(), vmin_u8(out_3, out_5)); + }, + input, output); +} + +template <> +void NENonLinearFilterKernel::max_filter_disk<5, 5>(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + + const auto input_top2_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, -2))); + const auto input_top_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, -1))); + const auto input_mid_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 0))); + const auto input_bot_ptr = static_cast(_input->ptr_to_element(Coordinates(-2, 1))); + const auto input_bot2_ptr = static_cast(_input->ptr_to_element(Coordinates(-1, 2))); + + execute_window_loop(win, [&](const Coordinates & id) + { + const uint8x16_t top2_data = vld1q_u8(input_top2_ptr + input.offset()); + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + const uint8x16_t bot2_data = vld1q_u8(input_bot2_ptr + input.offset()); + + const uint8x16_t rows_max_3 = vmaxq_u8(top2_data, bot2_data); + uint8x16_t rows_max_5 = vmaxq_u8(top_data, bot_data); + rows_max_5 = vmaxq_u8(rows_max_5, mid_data); + + const uint8x8_t out_3 = max_row<3>(rows_max_3); + const uint8x8_t out_5 = max_row<5>(rows_max_5); + + vst1_u8(output.ptr(), vmax_u8(out_3, out_5)); + }, + input, output); +} + +template +void NENonLinearFilterKernel::non_linear_filter_generic(const Window &win) +{ + Iterator input(_input, win); + Iterator output(_output, win); + ARM_COMPUTE_ERROR_ON(_input->buffer() == nullptr); + + const int k_row_half = mask_h / 2; + const int k_col_half = mask_w / 2; + constexpr int mask_size = mask_w * mask_h; + + // Set row pointers + std::array input_ptrs{ {} }; + for(int i = -k_row_half; i <= k_row_half; ++i) + { + input_ptrs[k_row_half + i] = _input->buffer() + _input->info()->offset_element_in_bytes(Coordinates(-k_col_half, i)); + } + + execute_window_loop(win, [&](const Coordinates & id) + { + std::array vals{ {} }; + + size_t v = 0; + size_t m = 0; + + for(unsigned int r = 0; r < mask_h; ++r) + { + const auto in_ptr = static_cast(input_ptrs[r] + input.offset()); + + for(unsigned int c = 0; c < mask_w; ++c, ++m) + { + if(_mask[m] == 255) + { + vals[v] = in_ptr[c]; + ++v; + } + } + } + + // Only do something if there is at least one non-zero element in the + // mask + if(v > 0) + { + std::sort(vals.begin(), vals.begin() + v); + + switch(_function) + { + case NonLinearFilterFunction::MIN: + *output.ptr() = vals[0]; + break; + case NonLinearFilterFunction::MAX: + *output.ptr() = vals[v - 1]; + break; + case NonLinearFilterFunction::MEDIAN: + *output.ptr() = vals[v / 2]; + break; + default: + break; + } + } + }, + input, output); +} + +void NENonLinearFilterKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + using NonLinearFilterFunction = void (NENonLinearFilterKernel::*)(const Window & window); + + // Function table for BOX pattern + static const std::array func_table_box = + { + { + &NENonLinearFilterKernel::median_filter_box<3, 3>, + 
&NENonLinearFilterKernel::min_filter_box<3, 3>, + &NENonLinearFilterKernel::max_filter_box<3, 3>, + &NENonLinearFilterKernel::median_filter_box<5, 5>, + &NENonLinearFilterKernel::min_filter_box<5, 5>, + &NENonLinearFilterKernel::max_filter_box<5, 5>, + } + }; + + // Function table for CROSS pattern + static const std::array func_table_cross = + { + { + &NENonLinearFilterKernel::median_filter_cross<3, 3>, + &NENonLinearFilterKernel::min_filter_cross<3, 3>, + &NENonLinearFilterKernel::max_filter_cross<3, 3>, + &NENonLinearFilterKernel::median_filter_cross<5, 5>, + &NENonLinearFilterKernel::min_filter_cross<5, 5>, + &NENonLinearFilterKernel::max_filter_cross<5, 5>, + } + }; + + // Function table for DISK pattern + static const std::array func_table_disk = + { + { + &NENonLinearFilterKernel::median_filter_box<3, 3>, + &NENonLinearFilterKernel::min_filter_box<3, 3>, + &NENonLinearFilterKernel::max_filter_box<3, 3>, + &NENonLinearFilterKernel::median_filter_disk<5, 5>, + &NENonLinearFilterKernel::min_filter_disk<5, 5>, + &NENonLinearFilterKernel::max_filter_disk<5, 5>, + } + }; + + // Function table for OTHER pattern + static const std::array func_table_generic = + { + { + &NENonLinearFilterKernel::non_linear_filter_generic<3, 3>, + &NENonLinearFilterKernel::non_linear_filter_generic<5, 5>, + } + }; + + switch(_pattern) + { + case MatrixPattern::BOX: + ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_box.size()); + (this->*func_table_box[_func_idx])(window); + break; + case MatrixPattern::CROSS: + ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_cross.size()); + (this->*func_table_cross[_func_idx])(window); + break; + case MatrixPattern::DISK: + ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_disk.size()); + (this->*func_table_disk[_func_idx])(window); + break; + case MatrixPattern::OTHER: + default: + ARM_COMPUTE_ERROR_ON(_func_idx >= func_table_generic.size()); + (this->*func_table_generic[_func_idx])(window); + break; + } +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp new file mode 100644 index 0000000000..1826c474f7 --- /dev/null +++ b/src/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.cpp @@ -0,0 +1,513 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +#ifdef ARM_COMPUTE_ENABLE_FP16 +namespace fp16 +{ +inline void mask_top(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask) +{ + // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2] + mask = vandq_u16(mask, vcgeq_f16(vc, in0)); + mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 1))); + mask = vandq_u16(mask, vcgeq_f16(vc, vextq_f16(in0, in1, 2))); +} + +inline void mask_middle(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask) +{ + // vc >= nc.val[0], vc > nc.val[2] + mask = vandq_u16(mask, vcgeq_f16(vc, in0)); + mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2))); +} + +inline void mask_bottom(const float16x8_t &vc, const float16x8_t &in0, const float16x8_t &in1, uint16x8_t &mask) +{ + // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2] + mask = vandq_u16(mask, vcgtq_f16(vc, in0)); + mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 1))); + mask = vandq_u16(mask, vcgtq_f16(vc, vextq_f16(in0, in1, 2))); +} + +inline void non_maxima_suppression3x3_F32_F32(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride) +{ + auto in = static_cast(in_ptr) - 1; + const auto out = static_cast(out_ptr); + + // Get centre scores + const float16x8x2_t vc = + { + vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 1)), vcvt_f16_f32(vld1q_f32(in + 5))), + vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 9)), vcvt_f16_f32(vld1q_f32(in + 13))) + }; + + // Neighboring pixels + in -= in_stride; + + static const float16x4_t zero_f16x4 = vdup_n_f16(0); + static const uint16x8_t zero_u16 = vdupq_n_u16(0); + static const uint16x8_t true_mask = vceqq_u16(zero_u16, zero_u16); + static const uint16x8x2_t true_mask_x2 = + { + true_mask, + true_mask + }; + + uint16x8x2_t mask = true_mask_x2; + + // Top row + const float16x8_t tmp_top0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4))); + const float16x8_t tmp_top1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12))); + const float16x8_t tmp_top2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4); + + // vc >= nc.val[0], vc >= nc.val[1], vc >= nc.val[2] + mask_top(vc.val[0], tmp_top0, tmp_top1, mask.val[0]); + mask_top(vc.val[1], tmp_top1, tmp_top2, mask.val[1]); + + in += in_stride; + + // Middle row + const float16x8_t tmp_mid0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4))); + const float16x8_t tmp_mid1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12))); + const float16x8_t tmp_mid2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4); + + // vc >= nc.val[0], vc > nc.val[2] + mask_middle(vc.val[0], tmp_mid0, tmp_mid1, mask.val[0]); + mask_middle(vc.val[1], tmp_mid1, tmp_mid2, mask.val[1]); + + in += in_stride; + + // Bottom row + const float16x8_t tmp_bot0 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in)), vcvt_f16_f32(vld1q_f32(in + 4))); + const float16x8_t tmp_bot1 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 8)), vcvt_f16_f32(vld1q_f32(in + 12))); + const 
float16x8_t tmp_bot2 = vcombine_f16(vcvt_f16_f32(vld1q_f32(in + 16)), zero_f16x4); + + // vc > nc.val[0], vc > nc.val[1], vc > nc.val[2] + mask_bottom(vc.val[0], tmp_bot0, tmp_bot1, mask.val[0]); + mask_bottom(vc.val[1], tmp_bot1, tmp_bot2, mask.val[1]); + + // Store + static const float16x8_t zero_f16x8 = vdupq_n_f16(0); + + const float16x8_t suppressed0 = vbslq_f16(mask.val[0], vc.val[0], zero_f16x8); + vst1q_f32(out + 0, vcvt_f32_f16(vget_low_f16(suppressed0))); + vst1q_f32(out + 4, vcvt_f32_f16(vget_high_f16(suppressed0))); + + const float16x8_t suppressed1 = vbslq_f16(mask.val[1], vc.val[1], zero_f16x8); + vst1q_f32(out + 8, vcvt_f32_f16(vget_low_f16(suppressed1))); + vst1q_f32(out + 12, vcvt_f32_f16(vget_high_f16(suppressed1))); +} + +inline void non_maxima_suppression3x3_U8_U8(const void *__restrict in_ptr, void *__restrict out_ptr, const uint32_t in_stride) +{ + auto in = static_cast(in_ptr) - 1; + const auto out = static_cast(out_ptr); + + // Get centre scores + const uint8x16_t vc = vld1q_u8(in + 1); + + // Neighboring pixels + in -= in_stride; + + // Top row + const uint8x16_t l_nc_0 = vld1q_u8(in); + const uint8x16_t m_nc_0 = vld1q_u8(in + 1); + const uint8x16_t r_nc_0 = vld1q_u8(in + 2); + + // Keep center scores if ... + // vc >= l_nc_0, vc >= m_nc_0, vc >= r_nc_0 + uint8x16_t mask = vcgeq_u8(vc, l_nc_0); + mask = vandq_u8(mask, vcgeq_u8(vc, m_nc_0)); + mask = vandq_u8(mask, vcgeq_u8(vc, r_nc_0)); + + in += in_stride; + + // Middle row + const uint8x16_t l_nc_1 = vld1q_u8(in); + const uint8x16_t r_nc_1 = vld1q_u8(in + 2); + + // ... and ... + // vc >= l_nc_1, vc > r_nc_1 + mask = vandq_u8(mask, vcgeq_u8(vc, l_nc_1)); + mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_1)); + + in += in_stride; + + // Bottom row + const uint8x16_t l_nc_2 = vld1q_u8(in); + const uint8x16_t m_nc_2 = vld1q_u8(in + 1); + const uint8x16_t r_nc_2 = vld1q_u8(in + 2); + + // ... and ... + // vc > l_nc_2, vc > m_nc_2, vc > r_nc_2 + mask = vandq_u8(mask, vcgtq_u8(vc, l_nc_2)); + mask = vandq_u8(mask, vcgtq_u8(vc, m_nc_2)); + mask = vandq_u8(mask, vcgtq_u8(vc, r_nc_2)); + + // Store + static const uint8x16_t zero = vdupq_n_u8(0); + vst1q_u8(out, vbslq_u8(mask, vc, zero)); +} +} // namespace fp16 + +void NENonMaximaSuppression3x3FP16Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + switch(input->info()->data_type()) + { + case DataType::U8: + _func = &fp16::non_maxima_suppression3x3_U8_U8; + break; + default: + _func = &fp16::non_maxima_suppression3x3_F32_F32; + break; + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 
0 : 3); + constexpr unsigned int num_elems_written_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} +#endif + +namespace +{ +inline void non_maxima_suppression3x3_FLOAT_FLOAT(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride) +{ + auto input = static_cast(input_ptr) - 1; + const auto output = static_cast(output_ptr); + + // Get centre scores + const float32x4x4_t vc = + { + { + vld1q_f32(input + 1), + vld1q_f32(input + 5), + vld1q_f32(input + 9), + vld1q_f32(input + 13) + } + }; + + // Neighboring pixels + float32x4x4_t l_nc{ {} }; + float32x4x4_t m_nc{ {} }; + float32x4x4_t r_nc{ {} }; + + input -= input_stride; + + // Row0 - Low part + float32x4_t tmp_low = vld1q_f32(input); + float32x4_t tmp_high = vld1q_f32(input + 4); + float32x4_t tmp_high1 = vld1q_f32(input + 8); + + l_nc.val[0] = tmp_low; + m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2); + + tmp_low = tmp_high; + tmp_high = tmp_high1; + + l_nc.val[1] = tmp_low; + m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2); + + // Row0 - High part + tmp_low = tmp_high1; + tmp_high = vld1q_f32(input + 12); + tmp_high1 = vld1q_f32(input + 16); + + l_nc.val[2] = tmp_low; + m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2); + + tmp_low = tmp_high; + tmp_high = tmp_high1; + + l_nc.val[3] = tmp_low; + m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2); + + // mc >= nc.val[0], mc >= nc.val[1], mc >= nc.val[2] + uint32x4x4_t mask{ {} }; + mask.val[0] = vcgeq_f32(vc.val[0], l_nc.val[0]); + mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], m_nc.val[0])); + mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], r_nc.val[0])); + mask.val[1] = vcgeq_f32(vc.val[1], l_nc.val[1]); + mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], m_nc.val[1])); + mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], r_nc.val[1])); + mask.val[2] = vcgeq_f32(vc.val[2], l_nc.val[2]); + mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], m_nc.val[2])); + mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], r_nc.val[2])); + mask.val[3] = vcgeq_f32(vc.val[3], l_nc.val[3]); + mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], m_nc.val[3])); + mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], r_nc.val[3])); + + input += input_stride; + + // Row1 - Low part + tmp_low = vld1q_f32(input); + tmp_high = vld1q_f32(input + 4); + tmp_high1 = vld1q_f32(input + 8); + + l_nc.val[0] = tmp_low; + r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2); + + tmp_low = tmp_high; + tmp_high = tmp_high1; + + l_nc.val[1] = tmp_low; + r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2); + + // Row1 - High part + tmp_low = tmp_high1; + tmp_high = vld1q_f32(input + 12); + tmp_high1 = vld1q_f32(input + 16); + + l_nc.val[2] = tmp_low; + r_nc.val[2] = 
vextq_f32(tmp_low, tmp_high, 2); + + tmp_low = tmp_high; + tmp_high = tmp_high1; + + l_nc.val[3] = tmp_low; + r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2); + + // mc >= nc.val[0], mc > nc.val[2] + mask.val[0] = vandq_u32(mask.val[0], vcgeq_f32(vc.val[0], l_nc.val[0])); + mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0])); + mask.val[1] = vandq_u32(mask.val[1], vcgeq_f32(vc.val[1], l_nc.val[1])); + mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1])); + mask.val[2] = vandq_u32(mask.val[2], vcgeq_f32(vc.val[2], l_nc.val[2])); + mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2])); + mask.val[3] = vandq_u32(mask.val[3], vcgeq_f32(vc.val[3], l_nc.val[3])); + mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3])); + + input += input_stride; + + // Row2 - Low part + tmp_low = vld1q_f32(input); + tmp_high = vld1q_f32(input + 4); + tmp_high1 = vld1q_f32(input + 8); + + l_nc.val[0] = tmp_low; + m_nc.val[0] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[0] = vextq_f32(tmp_low, tmp_high, 2); + + tmp_low = tmp_high; + tmp_high = tmp_high1; + + l_nc.val[1] = tmp_low; + m_nc.val[1] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[1] = vextq_f32(tmp_low, tmp_high, 2); + + // Row2 - High part + tmp_low = tmp_high1; + tmp_high = vld1q_f32(input + 12); + tmp_high1 = vld1q_f32(input + 16); + + l_nc.val[2] = tmp_low; + m_nc.val[2] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[2] = vextq_f32(tmp_low, tmp_high, 2); + + tmp_low = tmp_high; + tmp_high = tmp_high1; + + l_nc.val[3] = tmp_low; + m_nc.val[3] = vextq_f32(tmp_low, tmp_high, 1); + r_nc.val[3] = vextq_f32(tmp_low, tmp_high, 2); + + // mc > nc.val[0], mc > nc.val[1], mc > nc.val[2] + mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], l_nc.val[0])); + mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], m_nc.val[0])); + mask.val[0] = vandq_u32(mask.val[0], vcgtq_f32(vc.val[0], r_nc.val[0])); + mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], l_nc.val[1])); + mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], m_nc.val[1])); + mask.val[1] = vandq_u32(mask.val[1], vcgtq_f32(vc.val[1], r_nc.val[1])); + mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], l_nc.val[2])); + mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], m_nc.val[2])); + mask.val[2] = vandq_u32(mask.val[2], vcgtq_f32(vc.val[2], r_nc.val[2])); + mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], l_nc.val[3])); + mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], m_nc.val[3])); + mask.val[3] = vandq_u32(mask.val[3], vcgtq_f32(vc.val[3], r_nc.val[3])); + + static const float32x4_t zero = vdupq_n_f32(0.f); + + // Store + vst1q_f32(output + 0, vbslq_f32(mask.val[0], vc.val[0], zero)); + vst1q_f32(output + 4, vbslq_f32(mask.val[1], vc.val[1], zero)); + vst1q_f32(output + 8, vbslq_f32(mask.val[2], vc.val[2], zero)); + vst1q_f32(output + 12, vbslq_f32(mask.val[3], vc.val[3], zero)); +} + +inline void non_maxima_suppression3x3_U8_U8(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride) +{ + auto input = static_cast(input_ptr) - 1; + const auto output = static_cast(output_ptr); + + // Get centre scores + const uint8x16_t vc = vld1q_u8(input + 1); + + // Neighboring pixels + uint8x16_t l_nc{}; + uint8x16_t m_nc{}; + uint8x16_t r_nc{}; + + input -= input_stride; + + // Row0 + l_nc = vld1q_u8(input); + m_nc = vld1q_u8(input + 1); + r_nc = vld1q_u8(input + 2); + + // mc >= l_nc, mc >= m_nc, mc >= r_nc + uint8x16_t mask = vcgeq_u8(vc, l_nc); + 
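// Note the asymmetric comparisons used for the suppression test in this function: the row above
// and the left neighbour are checked with >= (vcgeq), while the right neighbour and the row
// below use a strict > (vcgtq). A pixel with an equal neighbour to its right or in the row
// below is therefore suppressed, so a plateau of equal local maxima cannot survive in full.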
mask = vandq_u8(mask, vcgeq_u8(vc, m_nc)); + mask = vandq_u8(mask, vcgeq_u8(vc, r_nc)); + + input += input_stride; + + // Row1 + l_nc = vld1q_u8(input); + r_nc = vld1q_u8(input + 2); + + // mc >= l_nc, mc > r_nc + mask = vandq_u8(mask, vcgeq_u8(vc, l_nc)); + mask = vandq_u8(mask, vcgtq_u8(vc, r_nc)); + + input += input_stride; + + // Row2 + l_nc = vld1q_u8(input); + m_nc = vld1q_u8(input + 1); + r_nc = vld1q_u8(input + 2); + + // mc > l_nc, mc > m_nc, mc > r_nc + mask = vandq_u8(mask, vcgtq_u8(vc, l_nc)); + mask = vandq_u8(mask, vcgtq_u8(vc, m_nc)); + mask = vandq_u8(mask, vcgtq_u8(vc, r_nc)); + + static const uint8x16_t zero = vdupq_n_u8(0); + + // Store + vst1q_u8(output, vbslq_u8(mask, vc, zero)); +} +} // namespace + +NENonMaximaSuppression3x3Kernel::NENonMaximaSuppression3x3Kernel() + : _func(nullptr), _input(nullptr), _output(nullptr) +{ +} + +BorderSize NENonMaximaSuppression3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void NENonMaximaSuppression3x3Kernel::configure(const ITensor *input, ITensor *output, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _input = input; + _output = output; + + if(input->info()->data_type() == DataType::U8) + { + _func = &non_maxima_suppression3x3_U8_U8; + } + else + { + _func = &non_maxima_suppression3x3_FLOAT_FLOAT; + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + const unsigned int num_elems_read_per_iteration = 16 + 2 * border_size().left + (input->info()->data_type() == DataType::U8 ? 0 : 3); + constexpr unsigned int num_elems_written_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 3; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NENonMaximaSuppression3x3Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + Iterator input(_input, window); + Iterator output(_output, window); + + const size_t input_stride = _input->info()->strides_in_bytes()[1] / element_size_from_data_type(_input->info()->data_type()); + + execute_window_loop(window, [&](const Coordinates & id) + { + _func(input.ptr(), output.ptr(), input_stride); + }, + input, output); +} diff --git a/src/core/NEON/kernels/NENormalizationLayerKernel.cpp b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp new file mode 100644 index 0000000000..a971dc8d97 --- /dev/null +++ b/src/core/NEON/kernels/NENormalizationLayerKernel.cpp @@ -0,0 +1,218 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +using namespace arm_compute; + +NENormalizationLayerKernel::NENormalizationLayerKernel() + : _func(nullptr), _input(nullptr), _input_squared(nullptr), _output(nullptr), _norm_info(NormType::IN_MAP_1D), _border_size() +{ +} + +BorderSize NENormalizationLayerKernel::border_size() const +{ + return _border_size; +} + +void NENormalizationLayerKernel::configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, input_squared); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, input_squared, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, input_squared, output); + ARM_COMPUTE_ERROR_ON_MSG(!(norm_info.norm_size() % 2), "Normalization size should be odd"); + ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.beta(), input); + ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.kappa(), input); + ARM_COMPUTE_ERROR_ON_VALUE_NOT_REPRESENTABLE_IN_FIXED_POINT(norm_info.scale_coeff(), input); + + const unsigned int border_width = (norm_info.type() == NormType::CROSS_MAP) ? 0 : std::min(norm_info.norm_size() / 2, 3U); + + _input = input; + _input_squared = input_squared; + _output = output; + _norm_info = norm_info; + _border_size = BorderSize(0, border_width); + + const bool is_dt_f32 = _input->info()->data_type() == DataType::F32; + + switch(norm_info.type()) + { + case NormType::IN_MAP_1D: + _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<0, false> : &NENormalizationLayerKernel::normalize_fixed_point<0, false>; + break; + case NormType::IN_MAP_2D: + // Normalize over X and Y + _func = (is_dt_f32) ? 
&NENormalizationLayerKernel::normalize<0, true> : &NENormalizationLayerKernel::normalize_fixed_point<0, true>; + break; + case NormType::CROSS_MAP: + _func = (is_dt_f32) ? &NENormalizationLayerKernel::normalize<2, false> : &NENormalizationLayerKernel::normalize_fixed_point<2, false>; + break; + default: + ARM_COMPUTE_ERROR("NOT SUPPORTED!"); + } + + const unsigned int num_elems_processed_per_iteration = (is_dt_f32) ? 4 : 16; + const unsigned int num_elems_read_per_iteration = num_elems_processed_per_iteration + 2 * (norm_info.norm_size() / 2); + const unsigned int num_rows = (norm_info.type() == NormType::IN_MAP_2D) ? norm_info.norm_size() : 1; + + // Configure window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle input_access(input->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows); + AccessWindowRectangle input_squared_access(input_squared->info(), -_border_size.left, 0, num_elems_read_per_iteration, num_rows); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, input_squared_access, output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +template +void NENormalizationLayerKernel::normalize(const Window &window) +{ + Iterator input(_input, window); + Iterator input_squared(_input_squared, window); + Iterator output(_output, window); + + const int dim_y = 1; + const int radius = _norm_info.norm_size() / 2; + const int total_size = _input->info()->dimension(dim) - 1; + const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim]; + // We account padding across X only and we iterate over rows + const int min_left = (dim == 2) ? 0 : -static_cast(border_size().left); + const int max_right = (dim == 2) ? total_size : total_size + border_size().left; + const int min_top = 0; + const int max_bottom = _input->info()->dimension(dim_y) - 1; + + const float32x4_t coeff_vec = vdupq_n_f32(_norm_info.scale_coeff()); + const float32x4_t beta_vec = vdupq_n_f32(_norm_info.beta()); + const float32x4_t kappa_vec = vdupq_n_f32(_norm_info.kappa()); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int current_slice = id[dim]; + const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; + const int last_row = do_2D_norm ? 
std::min(current_row + radius, max_bottom) : 0; + const int first_slice = std::max(current_slice - radius, min_left); + const int last_slice = std::min(current_slice + radius, max_right); + + // Accumulate 2D In-Map values + float32x4_t accu = vdupq_n_f32(0.f); + for(int j = first_row; j <= last_row; j++) + { + // Compute row displacement + const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; + const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride); + for(int i = first_slice; i <= last_slice; ++i) + { + accu = vaddq_f32(accu, vld1q_f32(reinterpret_cast(input_squared_ptr + i * input_squared_stride))); + } + } + + // Normalize + const float32x4_t normalized = vpowq_f32(vmlaq_f32(kappa_vec, coeff_vec, accu), beta_vec); + const float32x4_t normalized_pixel = vmulq_f32(vld1q_f32(reinterpret_cast(input.ptr())), vinvq_f32(normalized)); + vst1q_f32(reinterpret_cast(output.ptr()), normalized_pixel); + }, + input, input_squared, output); +} + +template +void NENormalizationLayerKernel::normalize_fixed_point(const Window &window) +{ + Iterator input(_input, window); + Iterator input_squared(_input_squared, window); + Iterator output(_output, window); + + const int dim_y = 1; + const int radius = _norm_info.norm_size() / 2; + const int total_size = _input->info()->dimension(dim) - 1; + const int input_squared_stride = _input_squared->info()->strides_in_bytes()[dim]; + // We account padding across X only and we iterate over rows + const int min_left = (dim == 2) ? 0 : -static_cast(border_size().left); + const int max_right = (dim == 2) ? total_size : total_size + border_size().left; + const int min_top = 0; + const int max_bottom = _input->info()->dimension(dim_y) - 1; + + const int fixed_point_position = _input->info()->fixed_point_position(); + + const qint8x16_t coeff_vec = vdupq_n_qs8_f32(_norm_info.scale_coeff(), fixed_point_position); + const qint8x16_t beta_vec = vdupq_n_qs8_f32(_norm_info.beta(), fixed_point_position); + const qint8x16_t kappa_vec = vdupq_n_qs8_f32(_norm_info.kappa(), fixed_point_position); + + execute_window_loop(window, [&](const Coordinates & id) + { + // Get range to normalize + const int current_row = do_2D_norm ? id[dim_y] : 0; + const int current_slice = id[dim]; + const int first_row = do_2D_norm ? std::max(current_row - radius, min_top) : 0; + const int last_row = do_2D_norm ? 
std::min(current_row + radius, max_bottom) : 0; + const int first_slice = std::max(current_slice - radius, min_left); + const int last_slice = std::min(current_slice + radius, max_right); + + // Accumulate 2D In-Map values + qint8x16_t accu = vdupq_n_qs8(0); + for(int j = first_row; j <= last_row; ++j) + { + // Compute row displacement + const int row = (j - current_row) * _input_squared->info()->strides_in_bytes()[dim_y]; + const uint8_t *const input_squared_ptr = input_squared.ptr() + row - (current_slice * input_squared_stride); + for(int i = first_slice; i <= last_slice; ++i) + { + accu = vqaddq_qs8(accu, vld1q_qs8(reinterpret_cast(input_squared_ptr + i * input_squared_stride))); + } + } + + // Normalize + const qint8x16_t accu_scale = vqmlaq_qs8(kappa_vec, coeff_vec, accu, fixed_point_position); + const qint8x16_t normalized = vqpowq_qs8(accu_scale, beta_vec, fixed_point_position); + const qint8x16_t normalized_pixel = vdivq_qs8(vld1q_qs8(reinterpret_cast(input.ptr())), normalized, fixed_point_position); + vst1q_qs8(reinterpret_cast(output.ptr()), normalized_pixel); + }, + input, input_squared, output); +} + +void NENormalizationLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + // Run function + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp new file mode 100644 index 0000000000..aa8c7a1847 --- /dev/null +++ b/src/core/NEON/kernels/NEPixelWiseMultiplicationKernel.cpp @@ -0,0 +1,524 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h" + +#include +#include +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +const float scale255_constant = 1.f / 255.f; +const float32x4_t scale255_constant_f32q = vdupq_n_f32(scale255_constant); +const float32x4_t positive_round_f32q = vdupq_n_f32(0.5f); + +/* Scales a given vector by 1/255. + * + * @note This does not work for all cases. e.g. for float of 0.49999999999999994 and large floats. + * + * @param in Input vector to scale. + * @return Scaled output rounded to nearest (round half up). + */ +inline int32x4_t scale255_S32_S32(int32x4_t in) +{ + // Scale + const float32x4_t tmp = vmulq_f32(vcvtq_f32_s32(in), scale255_constant_f32q); + // Round to nearest (round half up) + // Add +0.5 for all values + // Afterwards vcvt rounds toward zero + return vcvtq_s32_f32(vaddq_f32(tmp, positive_round_f32q)); +} + +inline uint16x8_t scale255_U16_U16(uint16x8_t in) +{ + const int32x4_t tmp_s1 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(in)))); + const int32x4_t tmp_s2 = scale255_S32_S32(vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(in)))); + return vreinterpretq_u16_s16(vcombine_s16(vmovn_s32(tmp_s2), vmovn_s32(tmp_s1))); +} + +template +void mul_U8_U8_U8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n) +{ + const auto input1 = static_cast(input1_ptr); + const auto input2 = static_cast(input2_ptr); + const auto output = static_cast(output_ptr); + + const uint8x16_t ta1 = vld1q_u8(input1); + const uint8x16_t ta2 = vld1q_u8(input2); + + uint16x8_t tmp1_high = vmovl_u8(vget_high_u8(ta1)); + const uint16x8_t tmp2_high = vmovl_u8(vget_high_u8(ta2)); + uint16x8_t tmp1_low = vmovl_u8(vget_low_u8(ta1)); + const uint16x8_t tmp2_low = vmovl_u8(vget_low_u8(ta2)); + + tmp1_high = vmulq_u16(tmp1_high, tmp2_high); + tmp1_low = vmulq_u16(tmp1_low, tmp2_low); + + if(is_scale255) + { + tmp1_high = scale255_U16_U16(tmp1_high); + tmp1_low = scale255_U16_U16(tmp1_low); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if(is_sat) + { + tmp1_high = vqshlq_u16(tmp1_high, vn); + tmp1_low = vqshlq_u16(tmp1_low, vn); + } + else + { + tmp1_high = vshlq_u16(tmp1_high, vn); + tmp1_low = vshlq_u16(tmp1_low, vn); + } + } + + if(is_sat) + { + vst1q_u8(output, vcombine_u8(vqmovn_u16(tmp1_low), vqmovn_u16(tmp1_high))); + } + else + { + vst1q_u8(output, vcombine_u8(vmovn_u16(tmp1_low), vmovn_u16(tmp1_high))); + } +} + +template +void mul_QS8_QS8_QS8_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n, int fixed_point_position) +{ + // n is the exponent of the scaling factor, that is scale = 1/2^n. Currently, we only support scaling factor equal to 1 => n = 0. 
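+    // The fixed-point rescaling is delegated to vqmulq_qs8 / vmulq_qs8 below, which already shift
+    // the widened product back by fixed_point_position, hence only the extra scale of 1 is accepted.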
+ ARM_COMPUTE_ERROR_ON_MSG(n != 0, "Scaling factor different than 1 not supported for 8-bit fixed-point pixel-wise multiplication"); + ARM_COMPUTE_UNUSED(n); + + const auto input1 = static_cast(input1_ptr); + const auto input2 = static_cast(input2_ptr); + const auto output = static_cast(output_ptr); + + const qint8x16_t ta1 = vld1q_qs8(input1); + const qint8x16_t ta2 = vld1q_qs8(input2); + + qint8x16_t res = (is_sat) ? vqmulq_qs8(ta1, ta2, fixed_point_position) : vmulq_qs8(ta1, ta2, fixed_point_position); + + vst1q_s8(output, res); +} + +template +inline int16x8_t mul_S16_S16_S16_n_loop(const int16x8_t &input1, const int16x8_t &input2, int n) +{ + int32x4_t tmp1_high = vmovl_s16(vget_high_s16(input1)); + const int32x4_t tmp2_high = vmovl_s16(vget_high_s16(input2)); + int32x4_t tmp1_low = vmovl_s16(vget_low_s16(input1)); + const int32x4_t tmp2_low = vmovl_s16(vget_low_s16(input2)); + + tmp1_high = vmulq_s32(tmp1_high, tmp2_high); + tmp1_low = vmulq_s32(tmp1_low, tmp2_low); + + if(is_scale255) + { + tmp1_high = scale255_S32_S32(tmp1_high); + tmp1_low = scale255_S32_S32(tmp1_low); + } + else + { + // Right shift amount + const int32x4_t vn = vdupq_n_s32(-n); + // Left shift amount + const int32x4_t vnl = vdupq_n_s32(n); + // Calculate conversion bit + const uint32x4_t tmp1_high_u = vreinterpretq_u32_s32(tmp1_high); + const uint32x4_t tmp1_low_u = vreinterpretq_u32_s32(tmp1_low); + const uint32x4_t sign_high = vshrq_n_u32(tmp1_high_u, 31); + const uint32x4_t sign_low = vshrq_n_u32(tmp1_low_u, 31); + const int32x4_t sign_high_s = vreinterpretq_s32_u32(sign_high); + const int32x4_t sign_low_s = vreinterpretq_s32_u32(sign_low); + const int32x4_t convert_high = vsubq_s32(vshlq_s32(sign_high_s, vnl), sign_high_s); + const int32x4_t convert_low = vsubq_s32(vshlq_s32(sign_low_s, vnl), sign_low_s); + if(is_sat) + { + tmp1_high = vqshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = vqshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + else + { + tmp1_high = vshlq_s32(vaddq_s32(tmp1_high, convert_high), vn); + tmp1_low = vshlq_s32(vaddq_s32(tmp1_low, convert_low), vn); + } + } + + if(is_sat) + { + return vcombine_s16(vqmovn_s32(tmp1_low), vqmovn_s32(tmp1_high)); + } + else + { + return vcombine_s16(vmovn_s32(tmp1_low), vmovn_s32(tmp1_high)); + } +} + +template +inline int16x8x2_t mul_S16_S16_S16_n_k(const int16x8x2_t &input1, const int16x8x2_t &input2, int n) +{ + const int16x8x2_t result = + { + { + // First 8 elements + mul_S16_S16_S16_n_loop(input1.val[0], input2.val[0], n), + // Second 8 elements + mul_S16_S16_S16_n_loop(input1.val[1], input2.val[1], n) + } + }; + + return result; +} + +template +void mul_S16_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n) +{ + const auto input1 = static_cast(input1_ptr); + const auto input2 = static_cast(input2_ptr); + const auto output = static_cast(output_ptr); + + const int16x8x2_t ta1 = vld2q_s16(input1); + const int16x8x2_t ta2 = vld2q_s16(input2); + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + + vst2q_s16(output, result); +} + +template +void mul_F32_F32_F32_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale) +{ + const auto input1 = static_cast(input1_ptr); + const auto input2 = static_cast(input2_ptr); + const auto output = static_cast(output_ptr); + + const float32x4x4_t ta1 = vld4q_f32(input1); + const float32x4x4_t ta2 = vld4q_f32(input2); + const float32x4_t scale_vec = vdupq_n_f32(scale); + 
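+    // vld4q and vst4q de-interleave and re-interleave with the same pattern, so each iteration is a
+    // plain element-wise product of 16 floats with the runtime scale folded in; unlike the integer
+    // paths above, no rounding or saturation is required here.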
const float32x4x4_t result = + { + { + vmulq_f32(vmulq_f32(ta1.val[0], ta2.val[0]), scale_vec), + vmulq_f32(vmulq_f32(ta1.val[1], ta2.val[1]), scale_vec), + vmulq_f32(vmulq_f32(ta1.val[2], ta2.val[2]), scale_vec), + vmulq_f32(vmulq_f32(ta1.val[3], ta2.val[3]), scale_vec) + } + }; + vst4q_f32(output, result); +} + +template +void mul_U8_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n) +{ + const auto input1 = static_cast(input1_ptr); + const auto input2 = static_cast(input2_ptr); + const auto output = static_cast(output_ptr); + + const uint8x16_t bv = vld1q_u8(input2); + const uint8x16_t av = vld1q_u8(input1); + + uint16x8_t tmp_low = vmovl_u8(vget_low_u8(av)); + uint16x8_t tmp_high = vmovl_u8(vget_high_u8(av)); + tmp_low = vmulq_u16(tmp_low, vmovl_u8(vget_low_u8(bv))); + tmp_high = vmulq_u16(tmp_high, vmovl_u8(vget_high_u8(bv))); + + if(is_scale255) + { + tmp_low = scale255_U16_U16(tmp_low); + tmp_high = scale255_U16_U16(tmp_high); + } + else + { + const int16x8_t vn = vdupq_n_s16(-n); + + if(is_sat) + { + tmp_low = vqshlq_u16(tmp_low, vn); + tmp_high = vqshlq_u16(tmp_high, vn); + } + else + { + tmp_low = vshlq_u16(tmp_low, vn); + tmp_high = vshlq_u16(tmp_high, vn); + } + } + + if(is_sat) + { + static const uint16x8_t max = vdupq_n_u16(SHRT_MAX); + + tmp_low = vminq_u16(tmp_low, max); + tmp_high = vminq_u16(tmp_high, max); + } + + vst1q_s16(output, vreinterpretq_s16_u16(tmp_low)); + vst1q_s16(output + 8, vreinterpretq_s16_u16(tmp_high)); +} + +template +void mul_S16_U8_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n) +{ + const auto input1 = static_cast(input1_ptr); + const auto input2 = static_cast(input2_ptr); + const auto output = static_cast(output_ptr); + + const int16x8x2_t ta1 = vld2q_s16(input1); + const uint8x8x2_t ta2u = vld2_u8(input2); + const int16x8x2_t ta2 = + { + { + vreinterpretq_s16_u16(vmovl_u8(ta2u.val[0])), + vreinterpretq_s16_u16(vmovl_u8(ta2u.val[1])) + } + }; + + const int16x8x2_t result = mul_S16_S16_S16_n_k(ta1, ta2, n); + + vst2q_s16(output, result); +} + +template +void mul_U8_S16_S16_n(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int n) +{ + // Simply swap the two input buffers + mul_S16_U8_S16_n(input2_ptr, input1_ptr, output_ptr, n); +} +} // namespace + +NEPixelWiseMultiplicationKernel::NEPixelWiseMultiplicationKernel() + : _func_float(nullptr), _func_int(nullptr), _func_q_int(nullptr), _input1(nullptr), _input2(nullptr), _output(nullptr), _scale{ 0 }, _scale_exponent{ 0 } +{ +} + +void NEPixelWiseMultiplicationKernel::configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::S16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), + "Output can only be U8 if both inputs are U8"); + if(output->info()->data_type() == DataType::QS8 || input1->info()->data_type() == DataType::QS8 || output->info()->data_type() == DataType::QS8) + 
{ + // All data types must be QS8 + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output); + } + + _input1 = input1; + _input2 = input2; + _output = output; + _scale = scale; + _scale_exponent = 0; + _func_int = nullptr; + _func_q_int = nullptr; + _func_float = nullptr; + + bool is_scale_255 = false; + // Check and validate scaling factor + if(std::abs(scale - scale255_constant) < 0.00001f) + { + ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_NEAREST_UP && rounding_policy != RoundingPolicy::TO_NEAREST_EVEN); + ARM_COMPUTE_UNUSED(rounding_policy); + + is_scale_255 = true; + } + else + { + ARM_COMPUTE_ERROR_ON(rounding_policy != RoundingPolicy::TO_ZERO); + ARM_COMPUTE_UNUSED(rounding_policy); + + int exponent = 0; + const float normalized_mantissa = std::frexp(scale, &exponent); + + // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 + // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -1 <= e <= 14 + // Moreover, it will be negative as we deal with 1/2^n + if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + { + // Store the positive exponent. We know that we compute 1/2^n + // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5 + _scale_exponent = std::abs(exponent - 1); + } + else + { + ARM_COMPUTE_ERROR("Scale value not supported (Should be 1/(2^n) or 1/255"); + } + } + + const DataType dt_input1 = input1->info()->data_type(); + const DataType dt_input2 = input2->info()->data_type(); + const DataType dt_output = output->info()->data_type(); + const bool is_sat = (overflow_policy == ConvertPolicy::SATURATE); + + if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::U8 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_U8_n : &mul_U8_U8_U8_n; + } + else + { + _func_int = is_sat ? &mul_U8_U8_U8_n : &mul_U8_U8_U8_n; + } + } + else if(DataType::S16 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_S16_S16_S16_n : &mul_S16_S16_S16_n; + } + else + { + _func_int = is_sat ? &mul_S16_S16_S16_n : &mul_S16_S16_S16_n; + } + } + else if(DataType::S16 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_S16_U8_S16_n : &mul_S16_U8_S16_n; + } + else + { + _func_int = is_sat ? &mul_S16_U8_S16_n : &mul_S16_U8_S16_n; + } + } + else if(DataType::U8 == dt_input1 && DataType::S16 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_S16_S16_n : &mul_U8_S16_S16_n; + } + else + { + _func_int = is_sat ? &mul_U8_S16_S16_n : &mul_U8_S16_S16_n; + } + } + else if(DataType::U8 == dt_input1 && DataType::U8 == dt_input2 && DataType::S16 == dt_output) + { + if(is_scale_255) + { + _func_int = is_sat ? &mul_U8_U8_S16_n : &mul_U8_U8_S16_n; + } + else + { + _func_int = is_sat ? &mul_U8_U8_S16_n : &mul_U8_U8_S16_n; + } + } + else if(DataType::QS8 == dt_input1 && DataType::QS8 == dt_input2 && DataType::QS8 == dt_output) + { + if(is_scale_255) + { + _func_q_int = is_sat ? &mul_QS8_QS8_QS8_n : &mul_QS8_QS8_QS8_n; + } + else + { + _func_q_int = is_sat ? 
&mul_QS8_QS8_QS8_n : &mul_QS8_QS8_QS8_n; + } + } + else if(DataType::F32 == dt_input1 && DataType::F32 == dt_input2 && DataType::F32 == dt_output) + { + _func_float = &mul_F32_F32_F32_n; + _func_int = nullptr; + } + else + { + ARM_COMPUTE_ERROR("You called with the wrong img formats"); + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*input1->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input1->info(), 0, num_elems_processed_per_iteration), + AccessWindowHorizontal(input2->info(), 0, num_elems_processed_per_iteration), + output_access); + + ValidRegion valid_region = intersect_valid_regions(input1->info()->valid_region(), + input2->info()->valid_region()); + + output_access.set_valid_region(win, valid_region); + + INEKernel::configure(win); +} + +void NEPixelWiseMultiplicationKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Iterator input1(_input1, window); + Iterator input2(_input2, window); + Iterator output(_output, window); + + if(_func_int != nullptr) + { + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent); + }, + input1, input2, output); + } + else if(_func_q_int != nullptr) + { + int fixed_point_position = _input1->info()->fixed_point_position(); + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func_q_int)(input1.ptr(), input2.ptr(), output.ptr(), _scale_exponent, fixed_point_position); + }, + input1, input2, output); + } + else + { + ARM_COMPUTE_ERROR_ON(_func_float == nullptr); + execute_window_loop(window, [&](const Coordinates & id) + { + (*_func_float)(input1.ptr(), input2.ptr(), output.ptr(), _scale); + }, + input1, input2, output); + } +} diff --git a/src/core/NEON/kernels/NEPoolingLayerKernel.cpp b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp new file mode 100644 index 0000000000..30b67b64b9 --- /dev/null +++ b/src/core/NEON/kernels/NEPoolingLayerKernel.cpp @@ -0,0 +1,415 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/FixedPoint.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +inline float calculate_avg_scale(const Coordinates &id, const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = id.x() * stride_x - pad_x; + int start_y = id.y() * stride_y - pad_y; + int end_x = std::min(start_x + pool_size, upper_bound_w); + int end_y = std::min(start_y + pool_size, upper_bound_h); + return 1.f / ((end_y - start_y) * (end_x - start_x)); +} + +inline qint8_t calculate_avg_scale_q8(const Coordinates &id, int pool_size, int upper_bound_w, int upper_bound_h, + int pad_x, int pad_y, int stride_x, int stride_y, int fixed_point_position) +{ + static std::array scale_values_q8 = + { { 0x0, 0x0, 0x40, 0x2A, 0x20, 0x19, 0x15, 0x12, 0x10, 0xE } }; + const int start_x = id.x() * stride_x - pad_x; + const int start_y = id.y() * stride_y - pad_y; + const int end_x = std::min(start_x + pool_size, upper_bound_w); + const int end_y = std::min(start_y + pool_size, upper_bound_h); + const int val = ((end_y - start_y) * (end_x - start_x)); + return scale_values_q8[val] >> (7 - fixed_point_position); +} +} // namespace + +NEPoolingLayerKernel::NEPoolingLayerKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _pool_info(), _num_elems_processed_per_iteration(0), _border_size(0) +{ +} + +BorderSize NEPoolingLayerKernel::border_size() const +{ + return _border_size; +} + +void NEPoolingLayerKernel::configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info) +{ + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + PoolingType pool_type = pool_info.pool_type(); + int pool_size = pool_info.pool_size(); + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info(); + DimensionRoundingType pool_round = pad_stride_info.round(); + std::tie(pool_pad_x, pool_pad_y) = pad_stride_info.pad(); + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QS8, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output); + ARM_COMPUTE_ERROR_ON(2 != pool_size && 3 != pool_size); + ARM_COMPUTE_ERROR_ON(pool_pad_x >= pool_size || pool_pad_y >= pool_size); + ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_type == PoolingType::AVG && input->info()->fixed_point_position() > 6); + ARM_COMPUTE_ERROR_ON(input->info()->data_type() == DataType::QS8 && pool_stride_x > 2); + + // Check output dimensions + std::tie(pooled_w, pooled_h) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), + pool_size, pool_stride_x, pool_stride_y, + pool_pad_x, pool_pad_y, pool_round); + ARM_COMPUTE_UNUSED(pooled_w); + 
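+    // e.g. a 7x7 input with pool_size = 3, stride = 2 and zero padding gives
+    // pooled_w = pooled_h = (7 - 3) / 2 + 1 = 3; the output tensor must already have been
+    // allocated with exactly these dimensions, which is what the check below enforces.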
ARM_COMPUTE_UNUSED(pooled_h); + ARM_COMPUTE_ERROR_ON((output->info()->dimension(0) != pooled_w) || (output->info()->dimension(1) != pooled_h)); + + unsigned int num_elems_read_per_iteration = 0; + unsigned int num_elems_processed_per_iteration = 0; + unsigned int num_elems_horizontal_window = 0; + + // Select element size + switch(input->info()->data_type()) + { + case DataType::QS8: + num_elems_read_per_iteration = 16; + num_elems_processed_per_iteration = (pool_size == 2) ? 8 : 7; + num_elems_horizontal_window = 8; + break; + case DataType::F32: + num_elems_read_per_iteration = (pool_size == 2) ? 2 : 4; // We use vload4 for pooling3 + num_elems_processed_per_iteration = 1; + num_elems_horizontal_window = 1; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + + _num_elems_processed_per_iteration = num_elems_processed_per_iteration; + const int input_width = input->info()->dimension(0); + const int input_height = input->info()->dimension(1); + const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height; + + // Set instance variables + _input = input; + _output = output; + _pool_info = pool_info; + _border_size = BorderSize(pool_pad_y, pool_pad_x); + _border_size.right = std::max(upper_bound_w, pool_pad_x); + _border_size.bottom = std::max(upper_bound_h, pool_pad_y); + + // Select appropriate function + switch(pool_size) + { + case 2: + if(input->info()->data_type() == DataType::QS8) + { + _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_q8 : &NEPoolingLayerKernel::pooling2_q8; + } + else if(input->info()->data_type() == DataType::F32) + { + _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling2_f32 : &NEPoolingLayerKernel::pooling2_f32; + } + break; + case 3: + if(input->info()->data_type() == DataType::QS8) + { + _func = (PoolingType::AVG == pool_type) ? &NEPoolingLayerKernel::pooling3_q8 : &NEPoolingLayerKernel::pooling3_q8; + } + else if(input->info()->data_type() == DataType::F32) + { + _func = (PoolingType::AVG == pool_type) ? 
&NEPoolingLayerKernel::pooling3_f32 : &NEPoolingLayerKernel::pooling3_f32; + } + break; + default: + ARM_COMPUTE_ERROR("Unsupported pooling size"); + break; + } + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_horizontal_window); + update_window_and_padding(win, input_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + INEKernel::configure(win); +} + +template +void NEPoolingLayerKernel::pooling2_q8(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int fixed_point_position = _input->info()->fixed_point_position(); + constexpr int pool_size = 2; + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x; + const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y; + + const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto top_data = vld1q_qs8(reinterpret_cast(input_top_ptr + input.offset())); + const auto bottom_data = vld1q_qs8(reinterpret_cast(input_bottom_ptr + input.offset())); + qint8x8_t res = {}; + if(pooling_type == PoolingType::AVG) + { + // Calculate scale + const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position); + const qint8x8_t scale_vec = vdup_n_qs8(scale); + + // Perform pooling + const qint8x16_t sum_data = vqaddq_qs8(top_data, bottom_data); + res = vqmul_qs8(vpadd_s8(vget_low_s8(sum_data), vget_high_s8(sum_data)), scale_vec, fixed_point_position); + } + else + { + const qint8x16_t max_data = vmaxq_s8(top_data, bottom_data); + res = vpmax_s8(vget_low_s8(max_data), vget_high_s8(max_data)); + } + vst1_qs8(reinterpret_cast(output.ptr()), res); + }, + input, output); +} + +template +void NEPoolingLayerKernel::pooling2_f32(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr int pool_size = 2; + int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x; + const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y; + + const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const 
float32x2_t top_data = vld1_f32(reinterpret_cast(input_top_ptr + input.offset())); + const float32x2_t bottom_data = vld1_f32(reinterpret_cast(input_bottom_ptr + input.offset())); + float32x2_t res = {}; + if(pooling_type == PoolingType::AVG) + { + // Calculate scale + float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x2_t sum_data = vadd_f32(top_data, bottom_data); + res = vmul_f32(vpadd_f32(sum_data, sum_data), scale_v); + } + else + { + const float32x2_t max_data = vmax_f32(top_data, bottom_data); + res = vpmax_f32(max_data, max_data); + } + *(reinterpret_cast(output.ptr())) = vget_lane_f32(res, 0); + }, + input, output); +} + +template +void NEPoolingLayerKernel::pooling3_q8(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + const int fixed_point_position = _input->info()->fixed_point_position(); + constexpr int pool_size = 3; + int pool_pad_x = 0; + int pool_pad_y = 0; + int pool_stride_x = 0; + int pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x; + const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y; + + const uint8_t *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const uint8_t *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + const uint8_t *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 2)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto top_data = vld1q_qs8(reinterpret_cast(input_top_ptr + input.offset())); + const auto middle_data = vld1q_qs8(reinterpret_cast(input_middle_ptr + input.offset())); + const auto bottom_data = vld1q_qs8(reinterpret_cast(input_bottom_ptr + input.offset())); + qint8x8_t res = {}; + if(pooling_type == PoolingType::AVG) + { + // Calculate scale + const qint8_t scale = calculate_avg_scale_q8(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y, fixed_point_position); + const qint8x8_t scale_vec = vdup_n_qs8(scale); + + // Perform pooling for stride 2 + const qint8x16_t sum_data = vqaddq_qs8(vqaddq_qs8(top_data, bottom_data), middle_data); + const qint8x16_t sum_data2 = vextq_s8(sum_data, sum_data, 1); + const qint8x16_t sum_data3 = vextq_s8(sum_data, sum_data, 2); + const qint8x16_t final_sum = vqaddq_qs8(vqaddq_qs8(sum_data, sum_data2), sum_data3); + if(pool_stride_x == 2) + { + const qint8x8x2_t table = { { vget_low_s8(final_sum), vget_high_s8(final_sum) } }; + static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; + res = vtbl2_s8(table, lookup_val); + } + else + { + res = vget_low_s8(final_sum); + } + res = vqmul_qs8(res, scale_vec, fixed_point_position); + } + else + { + const qint8x16_t max_data = vmaxq_s8(vmaxq_s8(top_data, bottom_data), middle_data); + const qint8x16_t max_data2 = vextq_s8(max_data, max_data, 1); + const qint8x16_t max_data3 = vextq_s8(max_data, max_data, 2); + const qint8x16_t final_max = vmaxq_s8(vmaxq_s8(max_data, max_data2), max_data3); + + if(pool_stride_x == 2) + { + const qint8x8x2_t 
table = { { vget_low_s8(final_max), vget_high_s8(final_max) } }; + static const qint8x8_t lookup_val = { 0, 2, 4, 6, 8, 10, 12, 14 }; + res = vtbl2_s8(table, lookup_val); + } + else + { + res = vget_low_s8(final_max); + } + } + vst1_qs8(reinterpret_cast(output.ptr()), res); + }, + input, output); +} + +template +void NEPoolingLayerKernel::pooling3_f32(const Window &window_input, const Window &window) +{ + Iterator input(_input, window_input); + Iterator output(_output, window); + + constexpr const int pool_size = 3; + int pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y = 0; + std::tie(pool_pad_x, pool_pad_y) = _pool_info.pad_stride_info().pad(); + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + const int upper_bound_w = _input->info()->dimension(0) + pool_pad_x; + const int upper_bound_h = _input->info()->dimension(1) + pool_pad_y; + + const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y))); + const unsigned char *const input_middle_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 1)); + const unsigned char *const input_bottom_ptr = _input->ptr_to_element(Coordinates(-static_cast(pool_pad_x), -static_cast(pool_pad_y) + 2)); + + execute_window_loop(window, [&](const Coordinates & id) + { + const float32x4_t top_data = vld1q_f32(reinterpret_cast(input_top_ptr + input.offset())); + const float32x4_t middle_data = vld1q_f32(reinterpret_cast(input_middle_ptr + input.offset())); + const float32x4_t bottom_data = vld1q_f32(reinterpret_cast(input_bottom_ptr + input.offset())); + float32x2_t res = {}; + if(pooling_type == PoolingType::AVG) + { + // Calculate scale + float scale = calculate_avg_scale(id, pool_size, upper_bound_w, upper_bound_h, pool_pad_x, pool_pad_y, pool_stride_x, pool_stride_y); + const float32x2_t scale_v = vdup_n_f32(scale); + + // Perform pooling + const float32x4_t sum_data = vaddq_f32(vaddq_f32(top_data, bottom_data), middle_data); + res = vpadd_f32(vget_high_f32(vsetq_lane_f32(0.f, sum_data, 3)), vget_low_f32(sum_data)); + res = vmul_f32(vpadd_f32(res, res), scale_v); + } + else + { + const float32x4_t max_data = vmaxq_f32(vmaxq_f32(top_data, bottom_data), middle_data); + res = vpmax_f32(vget_high_f32(vsetq_lane_f32(-std::numeric_limits::max(), max_data, 3)), vget_low_f32(max_data)); + res = vpmax_f32(res, res); + } + *(reinterpret_cast(output.ptr())) = vget_lane_f32(res, 0); + }, + input, output); +} + +void NEPoolingLayerKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + unsigned int pool_stride_x, pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info().stride(); + + // Set step for input in x and y direction for the input + Window window_input(window); + unsigned int window_x_inc = 0; + if(_input->info()->data_type() == DataType::QS8) + { + window_x_inc = (pool_stride_x == 2) ? 
_num_elems_processed_per_iteration * 2 : _num_elems_processed_per_iteration; + } + else + { + window_x_inc = pool_stride_x; + } + window_input.set(Window::DimX, Window::Dimension(window.x().start() * pool_stride_x, window.x().end() * pool_stride_x, window_x_inc)); + window_input.set(Window::DimY, Window::Dimension(window.y().start() * pool_stride_y, window.y().end() * pool_stride_y, pool_stride_y)); + + // Run function + (this->*_func)(window_input, window); +} diff --git a/src/core/NEON/kernels/NERemapKernel.cpp b/src/core/NEON/kernels/NERemapKernel.cpp new file mode 100644 index 0000000000..c3c44a5f32 --- /dev/null +++ b/src/core/NEON/kernels/NERemapKernel.cpp @@ -0,0 +1,226 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NERemapKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +inline int32x4_t offset_nearest_interpolation(const float *mapx_ptr, const float *mapy_ptr, const float32x4_t &width, const float32x4_t &height, const int32x4_t &stride) +{ + static const float32x4_t lowerxy = vdupq_n_f32(-1.0f); + + float32x4_t x = vld1q_f32(mapx_ptr); + float32x4_t y = vld1q_f32(mapy_ptr); + + // Clamp x coordinates + x = vmaxq_f32(lowerxy, vminq_f32(x, width)); + y = vmaxq_f32(lowerxy, vminq_f32(y, height)); + + const int32x4_t x_s32 = vcvtq_s32_f32(x); + const int32x4_t y_s32 = vcvtq_s32_f32(y); + + return vmlaq_s32(x_s32, y_s32, stride); +} + +} // namespace + +NERemapKernel::NERemapKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _map_x(nullptr), _map_y(nullptr) +{ +} + +void NERemapKernel::configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_x, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(map_y, 1, DataType::F32); + + _input = input; + _output = output; + _map_x = map_x; + _map_y = map_y; + + switch(policy) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + _func = &NERemapKernel::remap_nearest; + break; + } + case InterpolationPolicy::BILINEAR: + { + _func = &NERemapKernel::remap_bilinear; + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + break; + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic output_access(output->info(), 0, 0, output->info()->dimension(0), output->info()->dimension(1)); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, 1), + AccessWindowRectangle(map_x->info(), 0, 0, num_elems_processed_per_iteration, 1), + AccessWindowRectangle(map_y->info(), 0, 0, num_elems_processed_per_iteration, 1), + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NERemapKernel::remap_nearest(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + Iterator mapx(_map_x, window); + Iterator mapy(_map_y, window); + + const float32x4_t width = vdupq_n_f32(static_cast(_input->info()->dimension(0))); + const float32x4_t height = vdupq_n_f32(static_cast(_input->info()->dimension(1))); + const int32x4_t in_stride = vdupq_n_s32(static_cast(_input->info()->strides_in_bytes()[1])); + + 
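+    // offset_nearest_interpolation clamps each (map_x, map_y) pair and folds it into a single
+    // offset x + y * stride via vmlaq_s32; as the input is U8, the byte stride is also the element
+    // stride, so the sixteen source pixels of one iteration are gathered below with per-lane inserts.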
execute_window_loop(window, [&](const Coordinates & id) + { + const auto mapx_ptr = reinterpret_cast(mapx.ptr()); + const auto mapy_ptr = reinterpret_cast(mapy.ptr()); + const uint8_t *in_ptr = in.ptr(); + + const int32x4_t offset0 = offset_nearest_interpolation(mapx_ptr + 0, mapy_ptr + 0, width, height, in_stride); + const int32x4_t offset1 = offset_nearest_interpolation(mapx_ptr + 4, mapy_ptr + 4, width, height, in_stride); + const int32x4_t offset2 = offset_nearest_interpolation(mapx_ptr + 8, mapy_ptr + 8, width, height, in_stride); + const int32x4_t offset3 = offset_nearest_interpolation(mapx_ptr + 12, mapy_ptr + 12, width, height, in_stride); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 0)], tmp0, 0); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 1)], tmp0, 1); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 2)], tmp0, 2); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset0, 3)], tmp0, 3); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 0)], tmp0, 4); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 1)], tmp0, 5); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 2)], tmp0, 6); + tmp0 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset1, 3)], tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 0)], tmp1, 0); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 1)], tmp1, 1); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 2)], tmp1, 2); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset2, 3)], tmp1, 3); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 0)], tmp1, 4); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 1)], tmp1, 5); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 2)], tmp1, 6); + tmp1 = vset_lane_u8(in_ptr[vgetq_lane_s32(offset3, 3)], tmp1, 7); + + vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + }, + in, out, mapx, mapy); +} + +void NERemapKernel::remap_bilinear(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + Iterator mapx(_map_x, window); + Iterator mapy(_map_y, window); + + const size_t width = _input->info()->dimension(0); + const size_t height = _input->info()->dimension(1); + const size_t in_stride = _input->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto mapx_ptr = reinterpret_cast(mapx.ptr()); + const auto mapy_ptr = reinterpret_cast(mapy.ptr()); + const uint8_t *in_ptr = in.ptr(); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[0], mapy_ptr[0]), tmp0, 0); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[1], mapy_ptr[1]), tmp0, 1); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[2], mapy_ptr[2]), tmp0, 2); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[3], mapy_ptr[3]), tmp0, 3); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[4], mapy_ptr[4]), tmp0, 4); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[5], mapy_ptr[5]), tmp0, 5); + tmp0 = 
vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[6], mapy_ptr[6]), tmp0, 6); + tmp0 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[7], mapy_ptr[7]), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[8], mapy_ptr[8]), tmp1, 0); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[9], mapy_ptr[9]), tmp1, 1); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[10], mapy_ptr[10]), tmp1, 2); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[11], mapy_ptr[11]), tmp1, 3); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[12], mapy_ptr[12]), tmp1, 4); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[13], mapy_ptr[13]), tmp1, 5); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[14], mapy_ptr[14]), tmp1, 6); + tmp1 = vset_lane_u8(pixel_bilinear_c1u8_clamp(in_ptr, in_stride, width, height, mapx_ptr[15], mapy_ptr[15]), tmp1, 7); + + vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + }, + in, out, mapx, mapy); +} + +void NERemapKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEScaleKernel.cpp b/src/core/NEON/kernels/NEScaleKernel.cpp new file mode 100644 index 0000000000..fd2978de1c --- /dev/null +++ b/src/core/NEON/kernels/NEScaleKernel.cpp @@ -0,0 +1,359 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +NEScaleKernel::NEScaleKernel() + : _func(nullptr), _offsets(nullptr), _dx(nullptr), _dy(nullptr), _input(nullptr), _output(nullptr) +{ +} + +BorderSize NEScaleKernel::border_size() const +{ + return BorderSize(1); +} + +void NEScaleKernel::configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + + if(policy == InterpolationPolicy::NEAREST_NEIGHBOR) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); + } + + if(policy == InterpolationPolicy::BILINEAR) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(offsets, 1, DataType::S32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dx, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dy, 1, DataType::F32); + } + + ARM_COMPUTE_ERROR_ON(output->info()->dimension(0) == 0); + ARM_COMPUTE_ERROR_ON(output->info()->dimension(1) == 0); + + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_ERROR_ON(input->info()->dimension(i) != output->info()->dimension(i)); + } + + _input = input; + _output = output; + _offsets = offsets; + _dx = dx; + _dy = dy; + + switch(policy) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + { + _func = &NEScaleKernel::scale_nearest; + break; + } + case InterpolationPolicy::BILINEAR: + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dx, 1, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_dy, 1, DataType::F32); + + _func = &NEScaleKernel::scale_bilinear; + break; + } + case InterpolationPolicy::AREA: + { + _func = &NEScaleKernel::scale_area; + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported interpolation mode"); + } + + constexpr unsigned int num_elems_processed_per_iteration = 16; + const int border_offset = (border_undefined) ? 0 : border_size().left; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowStatic input_access(input->info(), -border_offset, -border_offset, input->info()->dimension(0) + border_offset, input->info()->dimension(1) + border_offset); + AccessWindowHorizontal offsets_access(offsets->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal dx_access(dx == nullptr ? nullptr : dx->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal dy_access(dy == nullptr ? 
nullptr : dy->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, + input_access, + offsets_access, + dx_access, + dy_access, + output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NEScaleKernel::scale_nearest(const Window &window) +{ + const size_t input_stride = _input->info()->strides_in_bytes()[1]; + + // Compute the ratio between source height and destination height + const auto hr = static_cast(_input->info()->dimension(1)) / static_cast(_output->info()->dimension(1)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_off; + win_off.set(Window::DimX, window[Window::DimX]); + win_off.set(Window::DimY, window[Window::DimY]); + + for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(_input, win_in); + Iterator out(_output, window); + Iterator offsets(_offsets, win_off); + + switch(_input->info()->data_type()) + { + case DataType::U8: + { + uint8x16_t tmp = vdupq_n_u8(0); + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const uint8_t *const in_ptr = in.ptr(); + + const size_t in_yi = (id.y() + 0.5f) * hr; + const size_t offset_row = in_yi * input_stride; + + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[0] + offset_row], tmp, 0); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[1] + offset_row], tmp, 1); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[2] + offset_row], tmp, 2); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[3] + offset_row], tmp, 3); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[4] + offset_row], tmp, 4); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[5] + offset_row], tmp, 5); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[6] + offset_row], tmp, 6); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[7] + offset_row], tmp, 7); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[8] + offset_row], tmp, 8); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[9] + offset_row], tmp, 9); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[10] + offset_row], tmp, 10); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[11] + offset_row], tmp, 11); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[12] + offset_row], tmp, 12); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[13] + offset_row], tmp, 13); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[14] + offset_row], tmp, 14); + tmp = vsetq_lane_u8(in_ptr[offsets_ptr[15] + offset_row], tmp, 15); + + vst1q_u8(out.ptr(), tmp); + }, + in, offsets, out); + break; + } + case DataType::S16: + { + int16x8x2_t tmp = + { + { + vdupq_n_s16(0), + vdupq_n_s16(0) + } + }; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + + const size_t in_yi = (id.y() + 0.5f) * hr; + const size_t offset_row = in_yi * input_stride; + + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[0] + offset_row), tmp.val[0], 0); + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[2] + offset_row), tmp.val[0], 1); + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[4] + offset_row), tmp.val[0], 2); + tmp.val[0] = 
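+ // Even-numbered offsets are gathered into tmp.val[0] and odd-numbered ones into tmp.val[1]; vst2q_s16 re-interleaves the two halves on store, so the output keeps its original pixel order.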
vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[6] + offset_row), tmp.val[0], 3); + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[8] + offset_row), tmp.val[0], 4); + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[10] + offset_row), tmp.val[0], 5); + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[12] + offset_row), tmp.val[0], 6); + tmp.val[0] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[14] + offset_row), tmp.val[0], 7); + + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[1] + offset_row), tmp.val[1], 0); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[3] + offset_row), tmp.val[1], 1); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[5] + offset_row), tmp.val[1], 2); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[7] + offset_row), tmp.val[1], 3); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[9] + offset_row), tmp.val[1], 4); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[11] + offset_row), tmp.val[1], 5); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[13] + offset_row), tmp.val[1], 6); + tmp.val[1] = vsetq_lane_s16(*reinterpret_cast(in.ptr() + offsets_ptr[15] + offset_row), tmp.val[1], 7); + + vst2q_s16(reinterpret_cast(out.ptr()), tmp); + }, + in, offsets, out); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported"); + break; + } +} + +void NEScaleKernel::scale_bilinear(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8); + + // Compute the ratio between source height and destination height + const auto hr = static_cast(_input->info()->dimension(1)) / static_cast(_output->info()->dimension(1)); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Window win_off; + win_off.set(Window::DimX, window.x()); + win_off.set(Window::DimY, window.y()); + + for(size_t d = Window::DimZ; d < _offsets->info()->num_dimensions(); ++d) + { + win_off.set(d, Window::Dimension(0, 0, 0)); + } + + Iterator in(_input, win_in); + Iterator out(_output, window); + Iterator offsets(_offsets, win_off); + Iterator dx(_dx, win_off); + Iterator dy(_dy, win_off); + + /* Input image stride */ + const size_t in_stride = _input->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto offsets_ptr = reinterpret_cast(offsets.ptr()); + const auto dx_ptr = reinterpret_cast(dx.ptr()); + const auto dy_ptr = reinterpret_cast(dy.ptr()); + const auto in_ptr = reinterpret_cast(in.ptr()); + + const size_t in_yi = std::floor((id.y() + 0.5f) * hr - 0.5f); + const size_t offset_row = in_yi * in_stride; + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[0] + offset_row], in_stride, dx_ptr[0], dy_ptr[0]), tmp0, 0); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[1] + offset_row], in_stride, dx_ptr[1], dy_ptr[1]), tmp0, 1); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[2] + offset_row], in_stride, dx_ptr[2], dy_ptr[2]), tmp0, 2); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[3] + offset_row], in_stride, dx_ptr[3], dy_ptr[3]), tmp0, 3); + tmp0 = 
vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[4] + offset_row], in_stride, dx_ptr[4], dy_ptr[4]), tmp0, 4); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[5] + offset_row], in_stride, dx_ptr[5], dy_ptr[5]), tmp0, 5); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[6] + offset_row], in_stride, dx_ptr[6], dy_ptr[6]), tmp0, 6); + tmp0 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[7] + offset_row], in_stride, dx_ptr[7], dy_ptr[7]), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[8] + offset_row], in_stride, dx_ptr[8], dy_ptr[8]), tmp1, 0); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[9] + offset_row], in_stride, dx_ptr[9], dy_ptr[9]), tmp1, 1); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[10] + offset_row], in_stride, dx_ptr[10], dy_ptr[10]), tmp1, 2); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[11] + offset_row], in_stride, dx_ptr[11], dy_ptr[11]), tmp1, 3); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[12] + offset_row], in_stride, dx_ptr[12], dy_ptr[12]), tmp1, 4); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[13] + offset_row], in_stride, dx_ptr[13], dy_ptr[13]), tmp1, 5); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[14] + offset_row], in_stride, dx_ptr[14], dy_ptr[14]), tmp1, 6); + tmp1 = vset_lane_u8(delta_bilinear_c1u8(&in_ptr[offsets_ptr[15] + offset_row], in_stride, dx_ptr[15], dy_ptr[15]), tmp1, 7); + + vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + }, + in, offsets, dx, dy, out); +} + +void NEScaleKernel::scale_area(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(_input, 1, DataType::U8); + + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const auto wr = static_cast(_input->info()->dimension(0)) / static_cast(_output->info()->dimension(0)); + const auto hr = static_cast(_input->info()->dimension(1)) / static_cast(_output->info()->dimension(1)); + const auto w = _input->info()->dimension(0); + const auto h = _input->info()->dimension(1); + const size_t in_stride = _input->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + + uint8x8_t tmp0 = vdup_n_u8(0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x(), id.y()), tmp0, 0); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 1, id.y()), tmp0, 1); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 2, id.y()), tmp0, 2); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 3, id.y()), tmp0, 3); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 4, id.y()), tmp0, 4); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 5, id.y()), tmp0, 5); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 6, id.y()), tmp0, 6); + tmp0 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 7, id.y()), tmp0, 7); + + uint8x8_t tmp1 = vdup_n_u8(0); + tmp1 = 
vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 8, id.y()), tmp1, 0); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 9, id.y()), tmp1, 1); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 10, id.y()), tmp1, 2); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 11, id.y()), tmp1, 3); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 12, id.y()), tmp1, 4); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 13, id.y()), tmp1, 5); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 14, id.y()), tmp1, 6); + tmp1 = vset_lane_u8(pixel_area_c1u8_clamp(in_ptr, in_stride, w, h, wr, hr, id.x() + 15, id.y()), tmp1, 7); + + vst1q_u8(out.ptr(), vcombine_u8(tmp0, tmp1)); + }, + in, out); +} + +void NEScaleKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEScharr3x3Kernel.cpp b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp new file mode 100644 index 0000000000..183df1efcb --- /dev/null +++ b/src/core/NEON/kernels/NEScharr3x3Kernel.cpp @@ -0,0 +1,259 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
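NEScaleKernel::run dispatches to whichever of the three paths was selected in configure. The nearest-neighbour path is the simplest: the source row comes from the height ratio hr and the source column from the precomputed offsets tensor. A scalar equivalent, assuming offsets holds one byte offset per output column into the source row (those offsets are filled in by the caller, not by this kernel):

#include <cstddef>
#include <cstdint>

// Scalar reference for the nearest-neighbour path (illustrative helper, assumed name).
inline uint8_t scale_nearest_ref(const uint8_t *in, size_t in_stride, const int32_t *offsets,
                                 float hr, int x, int y)
{
    const size_t in_yi      = static_cast<size_t>((y + 0.5f) * hr); // source row for output row y
    const size_t offset_row = in_yi * in_stride;                    // byte offset of that row
    return in[offsets[x] + offset_row];                             // gather a single pixel
}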
+ */ +#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace +{ +const int16x8_t three = vdupq_n_s16(3); +const int16x8_t minus_three = vdupq_n_s16(-3); +const int16x8_t ten = vdupq_n_s16(10); +const int16x8_t minus_ten = vdupq_n_s16(-10); + +inline int16x8_t scharr_y(const int16x8x2_t &top, const int16x8x2_t &bottom) +{ + // Top left + int16x8_t out = vmulq_s16(top.val[0], minus_three); + // Top center + out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 1), minus_ten); + // Top right + out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), minus_three); + + // Bottom left + out = vmlaq_s16(out, bottom.val[0], three); + // Bottom center + out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 1), ten); + // Bottom right + out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three); + + return out; +} + +inline int16x8_t scharr_x(const int16x8x2_t &top, const int16x8x2_t &middle, const int16x8x2_t &bottom) +{ + // Top left + int16x8_t out = vmulq_s16(top.val[0], minus_three); + // Top right + out = vmlaq_s16(out, vextq_s16(top.val[0], top.val[1], 2), three); + + // Middle left + out = vmlaq_s16(out, middle.val[0], minus_ten); + // Middle right + out = vmlaq_s16(out, vextq_s16(middle.val[0], middle.val[1], 2), ten); + + // Bottom left + out = vmlaq_s16(out, bottom.val[0], minus_three); + // Bottom right + out = vmlaq_s16(out, vextq_s16(bottom.val[0], bottom.val[1], 2), three); + + return out; +} +} // namespace + +NEScharr3x3Kernel::NEScharr3x3Kernel() + : _run_scharr_x(false), _run_scharr_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr) +{ +} + +void NEScharr3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_scharr_x = output_x != nullptr; + _run_scharr_y = output_y != nullptr; + + if(_run_scharr_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_scharr_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +BorderSize NEScharr3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void NEScharr3x3Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1)); + const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0)); + const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, +1)); + + Iterator input(_input, window); + Iterator output_y; + Iterator output_x; + + if(_run_scharr_y) + { + output_y = Iterator(_output_y, window); + } + + if(_run_scharr_x) + { + output_x = Iterator(_output_x, window); + } + + if(_run_scharr_x && _run_scharr_y) + { + execute_window_loop(window, [&](const Coordinates & id) + { + + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + vst1q_s16(reinterpret_cast(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16)); + vst1q_s16(reinterpret_cast(output_y.ptr()), scharr_y(top_s16, bot_s16)); + }, + input, output_x, output_y); + } + else if(_run_scharr_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + vst1q_s16(reinterpret_cast(output_x.ptr()), scharr_x(top_s16, mid_s16, bot_s16)); + }, + input, output_x); + } + else if(_run_scharr_y) + { + execute_window_loop(window, [&](const Coordinates & id) + { + + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + 
vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + vst1q_s16(reinterpret_cast(output_y.ptr()), scharr_y(top_s16, bot_s16)); + }, + input, output_y); + } +} diff --git a/src/core/NEON/kernels/NESobel3x3Kernel.cpp b/src/core/NEON/kernels/NESobel3x3Kernel.cpp new file mode 100644 index 0000000000..ab08a1cfeb --- /dev/null +++ b/src/core/NEON/kernels/NESobel3x3Kernel.cpp @@ -0,0 +1,269 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +NESobel3x3Kernel::NESobel3x3Kernel() + : _run_sobel_x(false), _run_sobel_y(false), _input(nullptr), _output_x(nullptr), _output_y(nullptr) +{ +} + +BorderSize NESobel3x3Kernel::border_size() const +{ + return BorderSize(1); +} + +void NESobel3x3Kernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 3; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), -border_size().left, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NESobel3x3Kernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + const unsigned char *const input_top_ptr = _input->ptr_to_element(Coordinates(-1, -1)); + const unsigned char *const input_mid_ptr = _input->ptr_to_element(Coordinates(-1, 0)); + const unsigned char *const input_bot_ptr = _input->ptr_to_element(Coordinates(-1, 1)); + + Iterator input(_input, window); + Iterator output_y; + Iterator output_x; + + if(_run_sobel_y) + { + output_y = Iterator(_output_y, window); + } + + if(_run_sobel_x) + { + output_x = Iterator(_output_x, window); + } + + static const int16x8_t two = vdupq_n_s16(2); + static const int16x8_t minustwo = vdupq_n_s16(-2); + + if(_run_sobel_y && _run_sobel_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //SOBEL Y + //top left + int16x8_t out_y = vnegq_s16(top_s16.val[0]); + //top mid + out_y = vmlaq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo); + //top right + out_y = vsubq_s16(out_y, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //bot left + out_y = vaddq_s16(out_y, bot_s16.val[0]); + //bot mid + out_y = vmlaq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + out_y = vaddq_s16(out_y, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(reinterpret_cast(output_y.ptr()), out_y); + + //SOBEL X + //top left + int16x8_t out_x = vnegq_s16(top_s16.val[0]); + //top right + out_x = vaddq_s16(out_x, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out_x = vmlaq_s16(out_x, mid_s16.val[0], minustwo); + //mid right + out_x = vmlaq_s16(out_x, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out_x = vsubq_s16(out_x, bot_s16.val[0]); + //bot right + out_x = vaddq_s16(out_x, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(reinterpret_cast(output_x.ptr()), out_x); + }, + input, output_x, output_y); + } + else if(_run_sobel_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + 
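+ // Widen each row of 16 unsigned bytes into two signed int16x8 halves so the signed Sobel taps can be accumulated without overflow.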
{ + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t mid_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(mid_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(mid_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //SOBEL X + //top left + int16x8_t out = vnegq_s16(top_s16.val[0]); + //top right + out = vaddq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //mid left + out = vmlaq_s16(out, mid_s16.val[0], minustwo); + //mid right + out = vmlaq_s16(out, vextq_s16(mid_s16.val[0], mid_s16.val[1], 2), two); + //bot left + out = vsubq_s16(out, bot_s16.val[0]); + //bot right + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(reinterpret_cast(output_x.ptr()), out); + }, + input, output_x); + } + else if(_run_sobel_y) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t top_data = vld1q_u8(input_top_ptr + input.offset()); + const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset()); + + const int16x8x2_t top_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(top_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(top_data))) + } + }; + const int16x8x2_t bot_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(bot_data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(bot_data))) + } + }; + + //SOBEL Y + //top left + int16x8_t out = vnegq_s16(top_s16.val[0]); + //top mid + out = vmlaq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 1), minustwo); + //top right + out = vsubq_s16(out, vextq_s16(top_s16.val[0], top_s16.val[1], 2)); + //bot left + out = vaddq_s16(out, bot_s16.val[0]); + //bot mid + out = vmlaq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 1), two); + //bot right + out = vaddq_s16(out, vextq_s16(bot_s16.val[0], bot_s16.val[1], 2)); + + vst1q_s16(reinterpret_cast(output_y.ptr()), out); + }, + input, output_y); + } +} diff --git a/src/core/NEON/kernels/NESobel5x5Kernel.cpp b/src/core/NEON/kernels/NESobel5x5Kernel.cpp new file mode 100644 index 0000000000..488eee1176 --- /dev/null +++ b/src/core/NEON/kernels/NESobel5x5Kernel.cpp @@ -0,0 +1,402 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
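The three run() branches above apply the classic 3x3 Sobel masks, Gx = [-1 0 1; -2 0 2; -1 0 1] and Gy = [-1 -2 -1; 0 0 0; 1 2 1], eight output pixels per iteration. A scalar reference for a single output pixel (illustrative helper, assumed name; src must have one valid row and column of border on each side):

#include <cstddef>
#include <cstdint>

inline void sobel3x3_ref(const uint8_t *src, size_t stride, int16_t &gx, int16_t &gy)
{
    const uint8_t *top = src - stride - 1; // top-left neighbour
    const uint8_t *mid = src - 1;          // left neighbour
    const uint8_t *bot = src + stride - 1; // bottom-left neighbour

    gx = static_cast<int16_t>(-top[0] + top[2] - 2 * mid[0] + 2 * mid[2] - bot[0] + bot[2]);
    gy = static_cast<int16_t>(-top[0] - 2 * top[1] - top[2] + bot[0] + 2 * bot[1] + bot[2]);
}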
+ */ +#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" + +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +NESobel5x5HorKernel::NESobel5x5HorKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) +{ +} + +BorderSize NESobel5x5HorKernel::border_size() const +{ + return _border_size; +} + +void NESobel5x5HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_x, 1, DataType::S16); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_y, 1, DataType::S16); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + _border_size = BorderSize(border_undefined ? 0 : 2, 2); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? 
nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration), + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NESobel5x5HorKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Window win_in(window); + win_in.shift(Window::DimX, -2); + + Iterator input(_input, win_in); + Iterator output_x; + Iterator output_y; + + if(_run_sobel_x) + { + output_x = Iterator(_output_x, window); + } + + if(_run_sobel_y) + { + output_y = Iterator(_output_y, window); + } + + if(_run_sobel_y && _run_sobel_x) + { + static const int16x8_t six = vdupq_n_s16(6); + static const int16x8_t four = vdupq_n_s16(4); + static const int16x8_t two = vdupq_n_s16(2); + static const int16x8_t minustwo = vdupq_n_s16(-2); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out_y = data_s16.val[0]; + out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four); + out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 2), six); + out_y = vmlaq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four); + out_y = vaddq_s16(out_y, vextq_s16(data_s16.val[0], data_s16.val[1], 4)); + + vst1q_s16(reinterpret_cast(output_y.ptr()), out_y); + + int16x8_t out_x = vnegq_s16(data_s16.val[0]); + out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo); + out_x = vmlaq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two); + out_x = vaddq_s16(out_x, vextq_s16(data_s16.val[0], data_s16.val[1], 4)); + + vst1q_s16(reinterpret_cast(output_x.ptr()), out_x); + }, + input, output_x, output_y); + } + else if(_run_sobel_x) + { + static const int16x8_t two = vdupq_n_s16(2); + static const int16x8_t minustwo = vdupq_n_s16(-2); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out = vnegq_s16(data_s16.val[0]); + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), minustwo); + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), two); + out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4)); + + vst1q_s16(reinterpret_cast(output_x.ptr()), out); + }, + input, output_x); + } + else if(_run_sobel_y) + { + static const int16x8_t six = vdupq_n_s16(6); + static const int16x8_t four = vdupq_n_s16(4); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + const int16x8x2_t data_s16 = + { + { + vreinterpretq_s16_u16(vmovl_u8(vget_low_u8(data))), + vreinterpretq_s16_u16(vmovl_u8(vget_high_u8(data))) + } + }; + + int16x8_t out = data_s16.val[0]; + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 1), four); + out = vmlaq_s16(out, 
vextq_s16(data_s16.val[0], data_s16.val[1], 2), six); + out = vmlaq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 3), four); + out = vaddq_s16(out, vextq_s16(data_s16.val[0], data_s16.val[1], 4)); + + vst1q_s16(reinterpret_cast(output_y.ptr()), out); + }, + input, output_y); + } +} + +NESobel5x5VertKernel::NESobel5x5VertKernel() + : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) +{ +} + +BorderSize NESobel5x5VertKernel::border_size() const +{ + return BorderSize(2, 0); +} + +void NESobel5x5VertKernel::configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S16); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S16); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S16); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S16); + } + + _input_x = input_x; + _input_y = input_y; + _output_x = output_x; + _output_y = output_y; + + const ITensor *const input = _run_sobel_x ? input_x : input_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 16; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 16; + constexpr unsigned int num_rows_read_per_iteration = 5; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + AccessWindowRectangle(input_y == nullptr ? 
nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NESobel5x5VertKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Iterator input_x; + Iterator input_y; + Iterator output_x; + Iterator output_y; + + const int16_t *input_x_low2_ptr = nullptr; + const int16_t *input_x_low_ptr = nullptr; + const int16_t *input_x_mid_ptr = nullptr; + const int16_t *input_x_top_ptr = nullptr; + const int16_t *input_x_top2_ptr = nullptr; + + const int16_t *input_y_low2_ptr = nullptr; + const int16_t *input_y_low_ptr = nullptr; + const int16_t *input_y_top_ptr = nullptr; + const int16_t *input_y_top2_ptr = nullptr; + + if(_run_sobel_x) + { + input_x = Iterator(_input_x, window); + output_x = Iterator(_output_x, window); + input_x_top2_ptr = reinterpret_cast(_input_x->ptr_to_element(Coordinates(0, -2))); + input_x_top_ptr = reinterpret_cast(_input_x->ptr_to_element(Coordinates(0, -1))); + input_x_mid_ptr = reinterpret_cast(_input_x->ptr_to_element(Coordinates(0, 0))); + input_x_low_ptr = reinterpret_cast(_input_x->ptr_to_element(Coordinates(0, 1))); + input_x_low2_ptr = reinterpret_cast(_input_x->ptr_to_element(Coordinates(0, 2))); + } + + if(_run_sobel_y) + { + input_y = Iterator(_input_y, window); + output_y = Iterator(_output_y, window); + input_y_top2_ptr = reinterpret_cast(_input_y->ptr_to_element(Coordinates(0, -2))); + input_y_top_ptr = reinterpret_cast(_input_y->ptr_to_element(Coordinates(0, -1))); + input_y_low_ptr = reinterpret_cast(_input_y->ptr_to_element(Coordinates(0, 1))); + input_y_low2_ptr = reinterpret_cast(_input_y->ptr_to_element(Coordinates(0, 2))); + } + + static const int16x8_t six = vdupq_n_s16(6); + static const int16x8_t four = vdupq_n_s16(4); + static const int16x8_t two = vdupq_n_s16(2); + static const int16x8_t minustwo = vdupq_n_s16(-2); + + if(_run_sobel_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + // Convert offset from uint8_t* to uint16_t* + const size_t input_offset_high_s16 = input_x.offset() / 2; + const size_t input_offset_low_s16 = input_offset_high_s16 + 8; + + //HIGH DATA + //top2 + int16x8_t data_high = vld1q_s16(input_x_top2_ptr + input_offset_high_s16); + int16x8_t out_high = data_high; + //top + data_high = vld1q_s16(input_x_top_ptr + input_offset_high_s16); + out_high = vmlaq_s16(out_high, data_high, four); + //mid + data_high = vld1q_s16(input_x_mid_ptr + input_offset_high_s16); + out_high = vmlaq_s16(out_high, data_high, six); + //low + data_high = vld1q_s16(input_x_low_ptr + input_offset_high_s16); + out_high = vmlaq_s16(out_high, data_high, four); + //low2 + data_high = vld1q_s16(input_x_low2_ptr + input_offset_high_s16); + out_high = vaddq_s16(out_high, data_high); + + vst1q_s16((reinterpret_cast(output_x.ptr())), out_high); + + //LOW DATA + //top2 + int16x8_t data_low = vld1q_s16(input_x_top2_ptr + input_offset_low_s16); + int16x8_t out_low = data_low; + //top + data_low = vld1q_s16(input_x_top_ptr + input_offset_low_s16); + out_low = vmlaq_s16(out_low, data_low, four); + //mid + data_low = vld1q_s16(input_x_mid_ptr + input_offset_low_s16); + out_low = vmlaq_s16(out_low, data_low, 
six); + //low + data_low = vld1q_s16(input_x_low_ptr + input_offset_low_s16); + out_low = vmlaq_s16(out_low, data_low, four); + //low2 + data_low = vld1q_s16(input_x_low2_ptr + input_offset_low_s16); + out_low = vaddq_s16(out_low, data_low); + + vst1q_s16((reinterpret_cast(output_x.ptr())) + 8, out_low); + }, + input_x, output_x); + } + + if(_run_sobel_y) + { + execute_window_loop(window, [&](const Coordinates & id) + { + // Convert offset from uint8_t* to uint16_t* + const size_t input_offset_high_s16 = input_y.offset() / 2; + const size_t input_offset_low_s16 = input_offset_high_s16 + 8; + + //HIGH DATA + //top2 + int16x8_t data_high = vld1q_s16(input_y_top2_ptr + input_offset_high_s16); + int16x8_t out_high = vnegq_s16(data_high); + //top + data_high = vld1q_s16(input_y_top_ptr + input_offset_high_s16); + out_high = vmlaq_s16(out_high, data_high, minustwo); + //low + data_high = vld1q_s16(input_y_low_ptr + input_offset_high_s16); + out_high = vmlaq_s16(out_high, data_high, two); + //low2 + data_high = vld1q_s16(input_y_low2_ptr + input_offset_high_s16); + out_high = vaddq_s16(out_high, data_high); + + vst1q_s16((reinterpret_cast(output_y.ptr())), out_high); + + //LOW DATA + //top2 + int16x8_t data_low = vld1q_s16(input_y_top2_ptr + input_offset_low_s16); + int16x8_t out_low = vnegq_s16(data_low); + //top + data_low = vld1q_s16(input_y_top_ptr + input_offset_low_s16); + out_low = vmlaq_s16(out_low, data_low, minustwo); + //low + data_low = vld1q_s16(input_y_low_ptr + input_offset_low_s16); + out_low = vmlaq_s16(out_low, data_low, two); + //low2 + data_low = vld1q_s16(input_y_low2_ptr + input_offset_low_s16); + out_low = vaddq_s16(out_low, data_low); + + vst1q_s16((reinterpret_cast(output_y.ptr())) + 8, out_low); + }, + input_y, output_y); + } +} diff --git a/src/core/NEON/kernels/NESobel7x7Kernel.cpp b/src/core/NEON/kernels/NESobel7x7Kernel.cpp new file mode 100644 index 0000000000..9761942c69 --- /dev/null +++ b/src/core/NEON/kernels/NESobel7x7Kernel.cpp @@ -0,0 +1,520 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
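The 5x5 Sobel is computed in two separable 1-D passes: the horizontal kernel applies the derivative taps [-1 -2 0 2 1] and the smoothing taps [1 4 6 4 1] along x, and the vertical kernel applies the complementary taps to the S16 intermediate along y. Composing the two 1-D filters reproduces the full 5x5 mask; a small self-contained check for the x-gradient (the y-gradient simply swaps the two tap sets):

#include <cstdio>

int main()
{
    const int deriv[5]  = { -1, -2, 0, 2, 1 }; // horizontal pass taps (d/dx)
    const int smooth[5] = { 1, 4, 6, 4, 1 };   // vertical pass taps (smoothing)

    for(int r = 0; r < 5; ++r)
    {
        for(int c = 0; c < 5; ++c)
        {
            std::printf("%5d", smooth[r] * deriv[c]); // element (r, c) of the composed 5x5 Sobel-x mask
        }
        std::printf("\n");
    }
    return 0;
}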
+ */ +#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +const int32x4_t minusfour = vdupq_n_s32(-4); +const int32x4_t minusfive = vdupq_n_s32(-5); +const int32x4_t four = vdupq_n_s32(4); +const int32x4_t five = vdupq_n_s32(5); +const int32x4_t six = vdupq_n_s32(6); +const int32x4_t fifteen = vdupq_n_s32(15); +const int32x4_t twenty = vdupq_n_s32(20); + +inline int32x4x2_t compute_hor_sobel_x(const int32x4x4_t &data) +{ + int32x4x2_t out = + { + { + vnegq_s32(data.val[0]), + vnegq_s32(data.val[1]) + } + }; + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[0], data.val[1], 1), minusfour); + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[0], data.val[1], 2), minusfive); + + out.val[0] = vmlaq_s32(out.val[0], data.val[1], five); + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[1], data.val[2], 1), four); + + out.val[0] = vaddq_s32(out.val[0], + vextq_s32(data.val[1], data.val[2], 2)); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[1], data.val[2], 1), minusfour); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[1], data.val[2], 2), minusfive); + + out.val[1] = vmlaq_s32(out.val[1], data.val[2], five); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[2], data.val[3], 1), four); + + out.val[1] = vaddq_s32(out.val[1], + vextq_s32(data.val[2], data.val[3], 2)); + + return out; +} + +inline int32x4x2_t compute_hor_sobel_y(const int32x4x4_t &data) +{ + int32x4x2_t out = + { + { + data.val[0], + data.val[1] + } + }; + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[0], data.val[1], 1), six); + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[0], data.val[1], 2), fifteen); + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[0], data.val[1], 3), twenty); + + out.val[0] = vmlaq_s32(out.val[0], data.val[1], fifteen); + + out.val[0] = vmlaq_s32(out.val[0], + vextq_s32(data.val[1], data.val[2], 1), six); + + out.val[0] = vaddq_s32(out.val[0], + vextq_s32(data.val[1], data.val[2], 2)); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[1], data.val[2], 1), six); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[1], data.val[2], 2), fifteen); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[1], data.val[2], 3), twenty); + + out.val[1] = vmlaq_s32(out.val[1], data.val[2], fifteen); + + out.val[1] = vmlaq_s32(out.val[1], + vextq_s32(data.val[2], data.val[3], 1), six); + + out.val[1] = vaddq_s32(out.val[1], + vextq_s32(data.val[2], data.val[3], 2)); + + return out; +} +} // namespace + +NESobel7x7HorKernel::NESobel7x7HorKernel() + : _input(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false), _border_size(0) +{ +} + +BorderSize NESobel7x7HorKernel::border_size() const +{ + return _border_size; +} + +void NESobel7x7HorKernel::configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input, Format::U8); + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = output_x != nullptr; + _run_sobel_y = output_y != nullptr; + + 
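+ // Either gradient component may be skipped; the checks and window setup below only touch the outputs that were actually requested.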
if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32); + } + + _input = input; + _output_x = output_x; + _output_y = output_y; + _border_size = BorderSize(border_undefined ? 0 : 3, 3); + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 16; + constexpr unsigned int num_elems_written_per_iteration = 8; + + Window win = calculate_max_window_horizontal(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowHorizontal(input->info(), -border_size().left, num_elems_read_per_iteration), + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NESobel7x7HorKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Iterator input(_input, window); + Iterator output_x; + Iterator output_y; + + if(_run_sobel_x) + { + output_x = Iterator(_output_x, window); + } + + if(_run_sobel_y) + { + output_y = Iterator(_output_y, window); + } + + if(_run_sobel_y && _run_sobel_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr() - 3); + + const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data)); + const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data)); + + const int32x4x4_t data_s32 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16))) + } + }; + + const int32x4x2_t out_y = compute_hor_sobel_y(data_s32); + vst1q_s32(reinterpret_cast(output_y.ptr()), out_y.val[0]); + vst1q_s32(reinterpret_cast(output_y.ptr()) + 4, out_y.val[1]); + + const int32x4x2_t out_x = compute_hor_sobel_x(data_s32); + vst1q_s32(reinterpret_cast(output_x.ptr()), out_x.val[0]); + vst1q_s32(reinterpret_cast(output_x.ptr()) + 4, out_x.val[1]); + }, + input, output_x, output_y); + } + else if(_run_sobel_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr() - 3); + + const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data)); + const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data)); + + const int32x4x4_t data_s32 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16))) + } + }; + + const int32x4x2_t out = compute_hor_sobel_x(data_s32); + vst1q_s32(reinterpret_cast(output_x.ptr()), out.val[0]); + vst1q_s32(reinterpret_cast(output_x.ptr()) + 4, out.val[1]); + }, + input, output_x); + } + else if(_run_sobel_y) + { + 
execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr() - 3); + + const uint16x8_t tmp_low_u16 = vmovl_u8(vget_low_u8(data)); + const uint16x8_t tmp_high_u16 = vmovl_u8(vget_high_u8(data)); + + const int32x4x4_t data_s32 = + { + { + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_low_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_low_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_low_u16(tmp_high_u16))), + vreinterpretq_s32_u32(vmovl_u16(vget_high_u16(tmp_high_u16))) + } + }; + + const int32x4x2_t out = compute_hor_sobel_y(data_s32); + vst1q_s32(reinterpret_cast(output_y.ptr()), out.val[0]); + vst1q_s32(reinterpret_cast(output_y.ptr()) + 4, out.val[1]); + }, + input, output_y); + } +} + +NESobel7x7VertKernel::NESobel7x7VertKernel() + : _input_x(nullptr), _input_y(nullptr), _output_x(nullptr), _output_y(nullptr), _run_sobel_x(false), _run_sobel_y(false) +{ +} + +BorderSize NESobel7x7VertKernel::border_size() const +{ + return BorderSize(3, 0); +} + +void NESobel7x7VertKernel::configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined) +{ + ARM_COMPUTE_ERROR_ON((output_x == nullptr) && (output_y == nullptr)); + + _run_sobel_x = (output_x != nullptr); + _run_sobel_y = (output_y != nullptr); + + if(_run_sobel_x) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_x, Format::S32); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_x, Format::S32); + } + + if(_run_sobel_y) + { + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(input_y, Format::S32); + ARM_COMPUTE_ERROR_ON_FORMAT_NOT_IN(output_y, Format::S32); + } + + _input_x = input_x; + _input_y = input_y; + _output_x = output_x; + _output_y = output_y; + + const ITensor *const input = _run_sobel_x ? input_x : input_y; + + // Configure kernel window + constexpr unsigned int num_elems_processed_per_iteration = 8; + constexpr unsigned int num_elems_read_per_iteration = 8; + constexpr unsigned int num_elems_written_per_iteration = 8; + constexpr unsigned int num_rows_read_per_iteration = 7; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration), border_undefined, border_size()); + AccessWindowHorizontal output_x_access(output_x == nullptr ? nullptr : output_x->info(), 0, num_elems_written_per_iteration); + AccessWindowHorizontal output_y_access(output_y == nullptr ? nullptr : output_y->info(), 0, num_elems_written_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input_x == nullptr ? nullptr : input_x->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + AccessWindowRectangle(input_y == nullptr ?
nullptr : input_y->info(), 0, -border_size().top, num_elems_read_per_iteration, num_rows_read_per_iteration), + output_x_access, + output_y_access); + + output_x_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + output_y_access.set_valid_region(win, input->info()->valid_region(), border_undefined, border_size()); + + INEKernel::configure(win); +} + +void NESobel7x7VertKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + + Iterator input_x; + Iterator input_y; + Iterator output_x; + Iterator output_y; + + int32_t in_x_stride = 0; + int32_t in_y_stride = 0; + + if(_run_sobel_x) + { + input_x = Iterator(_input_x, window); + output_x = Iterator(_output_x, window); + in_x_stride = _input_x->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_x->info()->format()); + } + + if(_run_sobel_y) + { + input_y = Iterator(_input_y, window); + output_y = Iterator(_output_y, window); + in_y_stride = _input_y->info()->strides_in_bytes()[1] / pixel_size_from_format(_input_y->info()->format()); + } + + if(_run_sobel_x) + { + execute_window_loop(window, [&](const Coordinates & id) + { + auto in_ptr = reinterpret_cast(input_x.ptr()) - 3 * in_x_stride; + + //top3 + int32x4x2_t data = + { + { + vld1q_s32(in_ptr), + vld1q_s32(in_ptr + 4) + } + }; + + int32x4x2_t out = data; + + //top2 + in_ptr += in_x_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], six); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], six); + + //top + in_ptr += in_x_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen); + + //mid + in_ptr += in_x_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], twenty); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], twenty); + + //low + in_ptr += in_x_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], fifteen); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], fifteen); + + //low2 + in_ptr += in_x_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], six); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], six); + + //low3 + in_ptr += in_x_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vaddq_s32(out.val[0], data.val[0]); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vaddq_s32(out.val[1], data.val[1]); + + vst1q_s32(reinterpret_cast(output_x.ptr()) + 0, out.val[0]); + vst1q_s32(reinterpret_cast(output_x.ptr()) + 4, out.val[1]); + }, + input_x, output_x); + } + + if(_run_sobel_y) + { + execute_window_loop(window, [&](const Coordinates & id) + { + auto in_ptr = reinterpret_cast(input_y.ptr()) - 3 * in_y_stride; + + //top3 + int32x4x2_t data = + { + { + vld1q_s32(in_ptr), + vld1q_s32(in_ptr + 4) + } + }; + + int32x4x2_t out = + { + { + vnegq_s32(data.val[0]), + vnegq_s32(data.val[1]) + } + }; + + //top2 + in_ptr += in_y_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfour); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfour); + + //top + in_ptr += in_y_stride; 
+ data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], minusfive); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], minusfive); + + //low + in_ptr += (2 * in_y_stride); + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], five); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], five); + + //low2 + in_ptr += in_y_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vmlaq_s32(out.val[0], data.val[0], four); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vmlaq_s32(out.val[1], data.val[1], four); + + //low3 + in_ptr += in_y_stride; + data.val[0] = vld1q_s32(in_ptr); + out.val[0] = vaddq_s32(out.val[0], data.val[0]); + + data.val[1] = vld1q_s32(in_ptr + 4); + out.val[1] = vaddq_s32(out.val[1], data.val[1]); + + vst1q_s32(reinterpret_cast(output_y.ptr()) + 0, out.val[0]); + vst1q_s32(reinterpret_cast(output_y.ptr()) + 4, out.val[1]); + }, + input_y, output_y); + } +} diff --git a/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp new file mode 100644 index 0000000000..942662e84b --- /dev/null +++ b/src/core/NEON/kernels/NESoftmaxLayerKernel.cpp @@ -0,0 +1,474 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/NEON/NEFixedPoint.h" +#include "arm_compute/core/NEON/NEMath.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include +#include +#include + +using namespace arm_compute; + +namespace +{ +void logits_1d_max_f32(const ITensor *in, ITensor *out, const Window &window) +{ + Window in_slice = window.first_slice_window_1D(); + + Window window_max(window); + window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window max_slice = window_max.first_slice_window_1D(); + + do + { + Iterator input(in, in_slice); + Iterator output(out, max_slice); + + float32x4_t vec_max = vdupq_n_f32(-FLT_MAX); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const float32x4_t current_value = vld1q_f32(in_ptr); + vec_max = vmaxq_f32(vec_max, current_value); + }, + input); + + float32x2_t carry_max = vpmax_f32(vget_high_f32(vec_max), vget_low_f32(vec_max)); + carry_max = vpmax_f32(carry_max, carry_max); + + *(reinterpret_cast(output.ptr())) = vget_lane_f32(carry_max, 0); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); +} + +void logits_1d_max_qs8(const ITensor *in, ITensor *out, const Window &window) +{ + Window in_slice = window.first_slice_window_1D(); + + Window window_max(window); + window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window max_slice = window_max.first_slice_window_1D(); + + do + { + Iterator input(in, in_slice); + Iterator output(out, max_slice); + + qint8x16_t vec_max = vdupq_n_s8(-1); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const qint8x16_t current_value = vld1q_qs8(in_ptr); + vec_max = vmaxq_qs8(vec_max, current_value); + }, + input); + + qint8x8_t carry_max = vpmax_qs8(vget_high_s8(vec_max), vget_low_s8(vec_max)); + carry_max = vpmax_qs8(carry_max, carry_max); + carry_max = vpmax_qs8(carry_max, carry_max); + carry_max = vpmax_qs8(carry_max, carry_max); + + *(reinterpret_cast(output.ptr())) = vget_lane_s8(carry_max, 0); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); +} +} // namespace + +NELogits1DMaxKernel::NELogits1DMaxKernel() + : _func(nullptr), _border_size() +{ +} + +BorderSize NELogits1DMaxKernel::border_size() const +{ + return _border_size; +} + +void NELogits1DMaxKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + const int input_width = input->info()->valid_region().shape.x(); + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &logits_1d_max_qs8; + num_elems_processed_per_iteration = 16; + break; + case DataType::F32: + num_elems_processed_per_iteration = 4; + _func = &logits_1d_max_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + _input = input; + _output = 
output; + _border_size = BorderSize(0, input_width % num_elems_processed_per_iteration, 0, 0); + + // Configure kernel window + constexpr unsigned int num_elems_written_per_row = 1; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_written_per_row, 1.f / input_width); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NELogits1DMaxKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _output, window); +} + +namespace +{ +void logits_1d_shift_exp_sum_f32(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) +{ + Window window_max(window); + window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + + Window max_slice = window_max.first_slice_window_1D(); + Window in_slice = window.first_slice_window_1D(); + + constexpr int step = 4; + const int long_steps = in->info()->valid_region().shape.x() / step; + const int small_steps = in->info()->valid_region().shape.x() % step; + + do + { + Iterator input(in, in_slice); + Iterator exp(out, in_slice); + Iterator _max(max, max_slice); + Iterator _sum(sum, max_slice); + + // Get pointers + auto in_ptr = reinterpret_cast(input.ptr()); + auto exp_ptr = reinterpret_cast(exp.ptr()); + + // Init sum to zero + float32x4_t vec_sum_value = vdupq_n_f32(0.0f); + + // Get max value + const auto max_ptr = reinterpret_cast(_max.ptr()); + const float32x4_t vec_max = vdupq_n_f32(*max_ptr); + + // Run neon loop + for(int i = 0; i < long_steps; ++i) + { + float32x4_t vec_elements = vld1q_f32(in_ptr); + vec_elements = vsubq_f32(vec_elements, vec_max); + vec_elements = vexpq_f32(vec_elements); + + vst1q_f32(exp_ptr, vec_elements); + vec_sum_value = vaddq_f32(vec_elements, vec_sum_value); + + in_ptr += step; + exp_ptr += step; + } + + // Reduce sum + float32x2_t carry_addition = vpadd_f32(vget_high_f32(vec_sum_value), vget_low_f32(vec_sum_value)); + carry_addition = vpadd_f32(carry_addition, carry_addition); + float sum = vget_lane_f32(carry_addition, 0); + + // Run remaining elements + for(int i = 0; i < small_steps; ++i) + { + float element = std::exp(in_ptr[i] - *max_ptr); + exp_ptr[i] = element; + sum += element; + } + + *(reinterpret_cast(_sum.ptr())) = sum; + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); +} +void logits_1d_shift_exp_sum_qs8(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window) +{ + Window window_max(window); + window_max.set(Window::DimX, Window::Dimension(0, 0, 0)); + + Window max_slice = window_max.first_slice_window_1D(); + Window in_slice = window.first_slice_window_1D(); + + constexpr int step = 8; + const int long_steps = in->info()->valid_region().shape.x() / step; + const int small_steps = in->info()->valid_region().shape.x() % step; + const int fixed_point_position = in->info()->fixed_point_position(); + + do + { + Iterator input(in, in_slice); + Iterator exp(out, in_slice); + Iterator _max(max, max_slice); + Iterator _sum(sum, max_slice); + + // Get pointers + auto in_ptr = 
reinterpret_cast(input.ptr()); + auto exp_ptr = reinterpret_cast(exp.ptr()); + + // Init sum to zero + qint16x8_t vec_sum_value = vdupq_n_qs16(0); + + // Get max value + const auto max_ptr = reinterpret_cast(_max.ptr()); + const qint8x8_t vec_max = vdup_n_qs8(*max_ptr); + + // Run neon loop + for(int i = 0; i < long_steps; ++i) + { + qint8x8_t vec_elements = vld1_qs8(in_ptr); + vec_elements = vqsub_qs8(vec_elements, vec_max); + vec_elements = vqexp_qs8(vec_elements, fixed_point_position); + + vst1_qs8(exp_ptr, vec_elements); + vec_sum_value = vqaddq_qs16(vec_sum_value, vmovl_s8(vec_elements)); + + in_ptr += step; + exp_ptr += step; + } + // Reduce sum + const qint16x4_t sum_red = vqadd_qs16(vget_low_s16(vec_sum_value), vget_high_s16(vec_sum_value)); + const qint16_t sum0 = sqadd_qs16(vget_lane_s16(sum_red, 0), vget_lane_s16(sum_red, 1)); + const qint16_t sum1 = sqadd_qs16(vget_lane_s16(sum_red, 2), vget_lane_s16(sum_red, 3)); + qint16_t sum = sqadd_qs16(sum0, sum1); + + // Run remaining elements + for(int i = 0; i < small_steps; ++i) + { + qint8_t element = sqexp_qs8(sqsub_qs8(in_ptr[i], *max_ptr), fixed_point_position); + exp_ptr[i] = element; + sum = sqadd_qs16(sum, element); + } + + *(reinterpret_cast(_sum.ptr())) = sqmovn_qs16(sum); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(max_slice)); +} +} //namespace + +NELogits1DShiftExpSumKernel::NELogits1DShiftExpSumKernel() + : _func(nullptr), _input(nullptr), _max(nullptr), _output(nullptr), _sum(nullptr) +{ +} + +void NELogits1DShiftExpSumKernel::configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(max, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, max, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, max, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(max, sum); + + unsigned int num_elems_processed_per_iteration = input->info()->valid_region().shape.x(); + + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &logits_1d_shift_exp_sum_qs8; + break; + case DataType::F32: + _func = &logits_1d_shift_exp_sum_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + _input = input; + _max = max; + _output = output; + _sum = sum; + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal max_access(max->info(), 0, 1); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + AccessWindowHorizontal sum_access(sum->info(), 0, 1); + + update_window_and_padding(win, input_access, max_access, output_access, sum_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + sum_access.set_valid_region(win, ValidRegion(Coordinates(), sum->info()->tensor_shape())); + + INEKernel::configure(win); +} + +void NELogits1DShiftExpSumKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _max, _output, _sum, window); +} + +namespace +{ +void logits_1d_norm_f32(const 
ITensor *in, const ITensor *sum, ITensor *out, const Window &window) +{ + Window window_sum(window); + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window sum_slice = window_sum.first_slice_window_1D(); + Window in_slice = window.first_slice_window_1D(); + + do + { + Iterator input(in, in_slice); + Iterator _sum(sum, sum_slice); + Iterator output(out, in_slice); + + const float sum_value = *reinterpret_cast(_sum.ptr()); + const float32x4_t vec_sum_inversed = vdupq_n_f32(1.0f / sum_value); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + const float32x4_t vec_in = vld1q_f32(in_ptr); + const float32x4_t normalized_value = vmulq_f32(vec_in, vec_sum_inversed); + + vst1q_f32(out_ptr, normalized_value); + }, + input, output); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); +} +void logits_1d_norm_qs8(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window) +{ + Window window_sum(window); + window_sum.set(Window::DimX, Window::Dimension(0, 0, 0)); + Window sum_slice = window_sum.first_slice_window_1D(); + Window in_slice = window.first_slice_window_1D(); + + const int fixed_point_position = in->info()->fixed_point_position(); + + do + { + Iterator input(in, in_slice); + Iterator _sum(sum, sum_slice); + Iterator output(out, in_slice); + + const int8_t sum_value = *reinterpret_cast(_sum.ptr()); + const qint8x16_t vec_sum_inversed = vqrecipq_qs8(vdupq_n_qs8(sum_value), fixed_point_position); + + execute_window_loop(in_slice, [&](const Coordinates & id) + { + const auto in_ptr = reinterpret_cast(input.ptr()); + const auto out_ptr = reinterpret_cast(output.ptr()); + + const qint8x16_t vec_in = vld1q_qs8(in_ptr); + const qint8x16_t normalized_value = vqmulq_qs8(vec_in, vec_sum_inversed, fixed_point_position); + + vst1q_qs8(out_ptr, normalized_value); + }, + input, output); + } + while(window.slide_window_slice_1D(in_slice) && window.slide_window_slice_1D(sum_slice)); +} +} // namespace + +NELogits1DNormKernel::NELogits1DNormKernel() + : _func(nullptr), _input(nullptr), _sum(nullptr), _output(nullptr) +{ +} + +void NELogits1DNormKernel::configure(const ITensor *input, const ITensor *sum, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output, sum); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output, sum); + ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input, output); + + _input = input; + _sum = sum; + _output = output; + + // Configure kernel window + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->data_type()) + { + case DataType::QS8: + _func = &logits_1d_norm_qs8; + num_elems_processed_per_iteration = 16; + break; + case DataType::F32: + num_elems_processed_per_iteration = 4; + _func = &logits_1d_norm_f32; + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + + AccessWindowHorizontal input_access(input->info(), 0, num_elems_processed_per_iteration); + AccessWindowStatic sum_access(sum->info(), 0, 0, 1, sum->info()->dimension(1)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + + update_window_and_padding(win, input_access, sum_access, output_access); + + 
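+ // Each input element maps to exactly one normalised output element, so the
+ // output simply inherits the input's valid region below; the per-row sum
+ // tensor is only read here (one value per row).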
output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NELogits1DNormKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _sum, _output, window); +} diff --git a/src/core/NEON/kernels/NETableLookupKernel.cpp b/src/core/NEON/kernels/NETableLookupKernel.cpp new file mode 100644 index 0000000000..f0b58d82f6 --- /dev/null +++ b/src/core/NEON/kernels/NETableLookupKernel.cpp @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ILut.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; + +constexpr unsigned int num_num_elems_processed_per_iteration = 16; +} // namespace arm_compute + +NETableLookupKernel::NETableLookupKernel() + : _func(nullptr), _lut(nullptr) +{ +} + +template +void NETableLookupKernel::tableLookup(const Window &window) +{ + uint32_t offset = _lut->index_offset(); + size_t count = _lut->num_elements(); + const auto lut = reinterpret_cast(_lut->buffer()); + unsigned int step = num_num_elems_processed_per_iteration; + + ARM_COMPUTE_ERROR_ON(lut == nullptr); + + Iterator input = Iterator(_input, window); + Iterator output = Iterator(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + auto input_ptr = reinterpret_cast(input.ptr()); + auto output_ptr = reinterpret_cast(output.ptr()); + + for(unsigned int i = 0; i < step; ++i, ++input_ptr, ++output_ptr) + { + const int32_t index = offset + *input_ptr; + + if(0 <= index && index < static_cast(count)) + { + *output_ptr = lut[index]; + } + } + }, + input, output); +} + +namespace arm_compute +{ +template <> +void NETableLookupKernel::tableLookup(const Window &window) +{ + const uint8_t *const lut = _lut->buffer(); + unsigned int step = num_num_elems_processed_per_iteration; + + ARM_COMPUTE_ERROR_ON(lut == nullptr); + + Iterator input = Iterator(_input, window); + Iterator output = Iterator(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8_t *input_ptr = input.ptr(); + uint8_t *output_ptr = output.ptr(); + + for(unsigned int i = 0; i < step; ++i) + { + *output_ptr++ = lut[*input_ptr++]; + } + }, + input, output); +} +} // namespace arm_compute + +void NETableLookupKernel::configure(const ITensor *input, const ILut *lut, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON(input == nullptr); + ARM_COMPUTE_ERROR_ON(lut == nullptr); + ARM_COMPUTE_ERROR_ON(output == nullptr); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + + _lut = lut; + + if(input->info()->data_type() == DataType::U8 && output->info()->data_type() == DataType::U8) + { + _func = &NETableLookupKernel::tableLookup; + } + else if(input->info()->data_type() == DataType::S16 && output->info()->data_type() == DataType::S16) + { + _func = &NETableLookupKernel::tableLookup; + } + else + { + ARM_COMPUTE_ERROR("Unsupported combination of input and output DataType."); + } + + INESimpleKernel::configure(input, output, num_num_elems_processed_per_iteration); +} + +void NETableLookupKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INESimpleKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NEThresholdKernel.cpp b/src/core/NEON/kernels/NEThresholdKernel.cpp new file mode 100644 index 0000000000..72031195d9 --- /dev/null +++ b/src/core/NEON/kernels/NEThresholdKernel.cpp @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016, 
2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +NEThresholdKernel::NEThresholdKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _threshold(0), _false_value(0), _true_value(0), _upper(0) +{ +} + +void NEThresholdKernel::configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + + _input = input; + _output = output; + _threshold = threshold; + _false_value = false_value; + _true_value = true_value; + _upper = upper; + + switch(type) + { + case ThresholdType::BINARY: + _func = &NEThresholdKernel::run_binary; + break; + case ThresholdType::RANGE: + _func = &NEThresholdKernel::run_range; + break; + default: + ARM_COMPUTE_ERROR("Thresholding type not recognized"); + break; + } + + const unsigned int num_elems_processed_per_iteration = 16; + + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration)); + AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration); + update_window_and_padding(win, AccessWindowHorizontal(input->info(), 0, num_elems_processed_per_iteration), output_access); + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +inline void NEThresholdKernel::run_binary(const Window &window) +{ + const uint8x16_t threshold = vdupq_n_u8(_threshold); + const uint8x16_t true_value = vdupq_n_u8(_true_value); + const uint8x16_t false_value = vdupq_n_u8(_false_value); + + Iterator input(_input, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + const uint8x16_t mask = vcgtq_u8(data, threshold); + + vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value)); + }, + input, output); +} + +inline void NEThresholdKernel::run_range(const Window &window) +{ + const uint8x16_t lower_threshold = 
vdupq_n_u8(_threshold); + const uint8x16_t upper_threshold = vdupq_n_u8(_upper); + const uint8x16_t true_value = vdupq_n_u8(_true_value); + const uint8x16_t false_value = vdupq_n_u8(_false_value); + + Iterator input(_input, window); + Iterator output(_output, window); + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x16_t data = vld1q_u8(input.ptr()); + + uint8x16_t mask = vcleq_u8(data, upper_threshold); + + mask = vandq_u8(vcgeq_u8(data, lower_threshold), mask); + + vst1q_u8(output.ptr(), vbslq_u8(mask, true_value, false_value)); + }, + input, output); +} + +void NEThresholdKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} diff --git a/src/core/NEON/kernels/NETransposeKernel.cpp b/src/core/NEON/kernels/NETransposeKernel.cpp new file mode 100644 index 0000000000..492de8a6ee --- /dev/null +++ b/src/core/NEON/kernels/NETransposeKernel.cpp @@ -0,0 +1,241 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" + +#include "arm_compute/core/AccessWindowTranspose.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/Validate.h" + +#include + +using namespace arm_compute; + +namespace arm_compute +{ +class Coordinates; +} // namespace arm_compute + +namespace +{ +void transpose_8bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator input(in, window); + Iterator output(out, window_out); + + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint8x8_t row0 = vld1_u8(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes)); + const uint8x8_t row1 = vld1_u8(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes)); + const uint8x8_t row2 = vld1_u8(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes)); + const uint8x8_t row3 = vld1_u8(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes)); + const uint8x8_t row4 = vld1_u8(reinterpret_cast(input.ptr() + 4 * input_stride_in_bytes)); + const uint8x8_t row5 = vld1_u8(reinterpret_cast(input.ptr() + 5 * input_stride_in_bytes)); + const uint8x8_t row6 = vld1_u8(reinterpret_cast(input.ptr() + 6 * input_stride_in_bytes)); + const uint8x8_t row7 = vld1_u8(reinterpret_cast(input.ptr() + 7 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint8x8x2_t k0_u8 = vtrn_u8(row0, row1); + const uint8x8x2_t k1_u8 = vtrn_u8(row2, row3); + const uint8x8x2_t k2_u8 = vtrn_u8(row4, row5); + const uint8x8x2_t k3_u8 = vtrn_u8(row6, row7); + + // Transpose 4x4 + const uint16x4x2_t k0_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[0]), vreinterpret_u16_u8(k1_u8.val[0])); + const uint16x4x2_t k1_u16 = vtrn_u16(vreinterpret_u16_u8(k0_u8.val[1]), vreinterpret_u16_u8(k1_u8.val[1])); + const uint16x4x2_t k2_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[0]), vreinterpret_u16_u8(k3_u8.val[0])); + const uint16x4x2_t k3_u16 = vtrn_u16(vreinterpret_u16_u8(k2_u8.val[1]), vreinterpret_u16_u8(k3_u8.val[1])); + + // Transpose 8x8 + const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k2_u16.val[0])); + const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k2_u16.val[1])); + const uint32x2x2_t k2_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[0]), vreinterpret_u32_u16(k3_u16.val[0])); + const uint32x2x2_t k3_u32 = vtrn_u32(vreinterpret_u32_u16(k1_u16.val[1]), vreinterpret_u32_u16(k3_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint8_t) + id.x() * output_stride_in_bytes; + + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[0]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[0]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[0]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), 
vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[0]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 4 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k0_u32.val[1]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 5 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k2_u32.val[1]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 6 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k1_u32.val[1]))); + vst1_u8(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 7 * output_stride_in_bytes), vreinterpret_u8_u16(vreinterpret_u16_u32(k3_u32.val[1]))); + }, + input, output); +} + +void transpose_16bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator input(in, window); + Iterator output(out, window_out); + + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint16x4_t row0 = vld1_u16(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes)); + const uint16x4_t row1 = vld1_u16(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes)); + const uint16x4_t row2 = vld1_u16(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes)); + const uint16x4_t row3 = vld1_u16(reinterpret_cast(input.ptr() + 3 * input_stride_in_bytes)); + + // Transpose 2x2 + const uint16x4x2_t k0_u16 = vtrn_u16(row0, row1); + const uint16x4x2_t k1_u16 = vtrn_u16(row2, row3); + + // Transpose 4x4 + const uint32x2x2_t k0_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[0]), vreinterpret_u32_u16(k1_u16.val[0])); + const uint32x2x2_t k1_u32 = vtrn_u32(vreinterpret_u32_u16(k0_u16.val[1]), vreinterpret_u32_u16(k1_u16.val[1])); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint16_t) + id.x() * output_stride_in_bytes; + + vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[0])); + vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[0])); + vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vreinterpret_u16_u32(k0_u32.val[1])); + vst1_u16(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vreinterpret_u16_u32(k1_u32.val[1])); + }, + input, output); +} + +void transpose_32bit_elements(const ITensor *in, ITensor *out, const Window &window) +{ + Window window_out(window); + window_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + window_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator input(in, window); + Iterator output(out, window_out); + + const size_t input_stride_in_bytes = in->info()->strides_in_bytes()[1]; + const size_t output_stride_in_bytes = out->info()->strides_in_bytes()[1]; + + execute_window_loop(window, [&](const Coordinates & id) + { + const uint32x4_t row0 = vld1q_u32(reinterpret_cast(input.ptr() + 0 * input_stride_in_bytes)); + const uint32x4_t row1 = vld1q_u32(reinterpret_cast(input.ptr() + 1 * input_stride_in_bytes)); + const uint32x4_t row2 = vld1q_u32(reinterpret_cast(input.ptr() + 2 * input_stride_in_bytes)); + const uint32x4_t row3 = vld1q_u32(reinterpret_cast(input.ptr() + 3 * 
input_stride_in_bytes)); + + // Transpose 2x2 + const uint32x2x2_t k0_u32 = vtrn_u32(vget_low_u32(row0), vget_low_u32(row1)); + const uint32x2x2_t k1_u32 = vtrn_u32(vget_high_u32(row2), vget_high_u32(row3)); + const uint32x2x2_t k2_u32 = vtrn_u32(vget_high_u32(row0), vget_high_u32(row1)); + const uint32x2x2_t k3_u32 = vtrn_u32(vget_low_u32(row2), vget_low_u32(row3)); + + // Compute destination address + const size_t dst_offset_in_bytes = id.y() * sizeof(uint32_t) + id.x() * output_stride_in_bytes; + + // Swap block 01 with block 10 and store + vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 0 * output_stride_in_bytes), vcombine_u32(k0_u32.val[0], k3_u32.val[0])); + vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 1 * output_stride_in_bytes), vcombine_u32(k0_u32.val[1], k3_u32.val[1])); + vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 2 * output_stride_in_bytes), vcombine_u32(k2_u32.val[0], k1_u32.val[0])); + vst1q_u32(reinterpret_cast(output.ptr() + dst_offset_in_bytes + 3 * output_stride_in_bytes), vcombine_u32(k2_u32.val[1], k1_u32.val[1])); + }, + input, output); +} +} // namespace + +NETransposeKernel::NETransposeKernel() + : _func(nullptr), _input(nullptr), _output(nullptr) +{ +} + +void NETransposeKernel::configure(const ITensor *input, ITensor *output) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8, DataType::S8, DataType::QS8, DataType::U16, DataType::S16, DataType::U32, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON(output == nullptr); + + TensorShape output_shape{ input->info()->tensor_shape() }; + const size_t w_out = input->info()->dimension(1); + const size_t h_out = input->info()->dimension(0); + output_shape.set(0, w_out); + output_shape.set(1, h_out); + + // Output tensor auto inizialitation if not yet initialized + auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position()); + + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape); + + _input = input; + _output = output; + + unsigned int num_elems_processed_per_iteration = 0; + + switch(input->info()->element_size()) + { + case 1: + _func = &transpose_8bit_elements; + num_elems_processed_per_iteration = 8; + break; + case 2: + _func = &transpose_16bit_elements; + num_elems_processed_per_iteration = 4; + break; + case 4: + _func = &transpose_32bit_elements; + num_elems_processed_per_iteration = 4; + break; + default: + ARM_COMPUTE_ERROR("Element size not supported"); + break; + } + + // Configure kernel window + Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration)); + AccessWindowTranspose output_access(output->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + + update_window_and_padding(win, + AccessWindowRectangle(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration), + output_access); + + output_access.set_valid_region(win, input->info()->valid_region()); + + INEKernel::configure(win); +} + +void NETransposeKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (*_func)(_input, _output, window); +} diff --git a/src/core/NEON/kernels/NEWarpKernel.cpp 
b/src/core/NEON/kernels/NEWarpKernel.cpp new file mode 100644 index 0000000000..6c90a334af --- /dev/null +++ b/src/core/NEON/kernels/NEWarpKernel.cpp @@ -0,0 +1,651 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/Coordinates.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + +#include + +using namespace arm_compute; + +namespace +{ +inline uint8_t nearest_interpolation(const uint8_t *in_ptr, int x, int y, size_t stride) +{ + return in_ptr[x + y * stride]; +} +} // namespace + +INEWarpKernel::INEWarpKernel() + : _func(nullptr), _input(nullptr), _output(nullptr), _constant_border_value(0), _matrix(nullptr) +{ +} + +void INEWarpKernel::run(const Window &window) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window); + ARM_COMPUTE_ERROR_ON(_func == nullptr); + + (this->*_func)(window); +} + +void INEWarpKernel::configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value) +{ + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8); + ARM_COMPUTE_ERROR_ON(nullptr == matrix); + + _matrix = matrix; + _constant_border_value = constant_border_value; + + switch(border_mode) + { + case BorderMode::UNDEFINED: + _func = &INEWarpKernel::warp_undefined; + break; + case BorderMode::CONSTANT: + _func = &INEWarpKernel::warp_constant; + break; + case BorderMode::REPLICATE: + _func = &INEWarpKernel::warp_replicate; + break; + default: + ARM_COMPUTE_ERROR("Border mode not supported"); + break; + } + + _input = input; + _output = output; + + // Configure kernel window + Window win = calculate_max_window(*output->info(), Steps(1U)); + + const ValidRegion &input_valid_region = input->info()->valid_region(); + + // Reads can occur within the valid region of the input + AccessWindowStatic input_access(input->info(), + input_valid_region.anchor[0], input_valid_region.anchor[1], + input_valid_region.anchor[0] + input_valid_region.shape[0], + 
input_valid_region.anchor[1] + input_valid_region.shape[1]); + AccessWindowHorizontal output_access(output->info(), 0, 1); + + update_window_and_padding(win, input_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + + INEKernel::configure(win); +} + +template +void NEWarpAffineKernel::warp_undefined(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int min_x = _input->info()->valid_region().anchor[0]; + const int max_x = min_x + _input->info()->valid_region().shape[0]; + const int min_y = _input->info()->valid_region().anchor[1]; + const int max_y = min_y + _input->info()->valid_region().shape[1]; + const size_t stride = _input->info()->strides_in_bytes()[1]; + + // x0 = M01 * x + M01 * y + M02 + // y0 = M11 * x + M11 * y + M12 + const float M00 = _matrix[0]; + const float M10 = _matrix[1]; + const float M01 = _matrix[0 + 1 * 2]; + const float M11 = _matrix[1 + 1 * 2]; + const float M02 = _matrix[0 + 2 * 2]; + const float M12 = _matrix[1 + 2 * 2]; + + // "M00 * x" and "M10 * x", when x = window.x.start + const float start_x0 = M00 * window.x().start(); + const float start_y0 = M10 * window.x().start(); + + // Current row + int y_cur = window.y().start(); + + // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing + float const_x0 = M01 * y_cur + M02; + float const_y0 = M11 * y_cur + M12; + + // Affine warp coordinates + float x0 = start_x0 + const_x0; + float y0 = start_y0 + const_y0; + + execute_window_loop(window, [&](const Coordinates & id) + { + // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0 + if(y_cur != id.y()) + { + y_cur = id.y(); + + const_x0 = M01 * y_cur + M02; + const_y0 = M11 * y_cur + M12; + + x0 = start_x0 + const_x0; + y0 = start_y0 + const_y0; + } + + // Only write to output if x0 and y0 are within the valid region. + // Otherwise the read value would be undefined. 
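+ // The source coordinates are advanced incrementally: since
+ // x0 = M00 * x + M01 * y + M02 (and similarly for y0), stepping one pixel along
+ // the row only changes the M00 * x and M10 * x terms, so the loop body just adds
+ // M00 and M10 after each pixel instead of re-evaluating the full transform.
+ // For example, with a (hypothetical) identity matrix
+ // {M00=1, M10=0, M01=0, M11=1, M02=0, M12=0} the source coordinates track the
+ // output coordinates exactly and every in-range pixel is copied unchanged.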
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x)) + { + switch(interpolation) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride); + break; + case InterpolationPolicy::BILINEAR: + *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0); + break; + default: + ARM_COMPUTE_ERROR("Interpolation not supported"); + } + } + + x0 += M00; + y0 += M10; + }, + in, out); +} + +template +void NEWarpAffineKernel::warp_constant(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int min_x = _input->info()->valid_region().anchor[0]; + const int max_x = min_x + _input->info()->valid_region().shape[0]; + const int min_y = _input->info()->valid_region().anchor[1]; + const int max_y = min_y + _input->info()->valid_region().shape[1]; + const size_t stride = _input->info()->strides_in_bytes()[1]; + + // x0 = M01 * x + M01 * y + M02 + // y0 = M11 * x + M11 * y + M12 + const float M00 = _matrix[0]; + const float M10 = _matrix[1]; + const float M01 = _matrix[0 + 1 * 2]; + const float M11 = _matrix[1 + 1 * 2]; + const float M02 = _matrix[0 + 2 * 2]; + const float M12 = _matrix[1 + 2 * 2]; + + // "M00 * x" and "M10 * x", when x = window.x.start + const float start_x0 = M00 * window.x().start(); + const float start_y0 = M10 * window.x().start(); + + // Current row + int y_cur = window.y().start(); + + // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing + float const_x0 = M01 * y_cur + M02; + float const_y0 = M11 * y_cur + M12; + + // Affine warp coordinates + float x0 = start_x0 + const_x0; + float y0 = start_y0 + const_y0; + + execute_window_loop(window, [&](const Coordinates & id) + { + // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0 + if(y_cur != id.y()) + { + y_cur = id.y(); + + const_x0 = M01 * y_cur + M02; + const_y0 = M11 * y_cur + M12; + + x0 = start_x0 + const_x0; + y0 = start_y0 + const_y0; + } + + // Only use input values if x0 and y0 are within the valid region. + // Otherwise write the constant border value. 
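+ // Pixels whose source coordinates fall outside the input's valid region take
+ // _constant_border_value in the else branch below. For instance, with a
+ // (hypothetical) pure translation {M00=1, M11=1, M02=10, all other entries 0},
+ // the sampling window is shifted 10 pixels to the right, so the right-most 10
+ // output columns would read past the input and are filled with the constant.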
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x)) + { + switch(interpolation) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride); + break; + case InterpolationPolicy::BILINEAR: + *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0); + break; + default: + ARM_COMPUTE_ERROR("Interpolation not supported"); + } + } + else + { + *out.ptr() = _constant_border_value; + } + + x0 += M00; + y0 += M10; + }, + in, out); +} + +template +void NEWarpAffineKernel::warp_replicate(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int min_x = _input->info()->valid_region().anchor[0]; + const int max_x = min_x + _input->info()->valid_region().shape[0]; + const int min_y = _input->info()->valid_region().anchor[1]; + const int max_y = min_y + _input->info()->valid_region().shape[1]; + const size_t stride = _input->info()->strides_in_bytes()[1]; + + // Current row + int y_cur = window.y().start(); + + const float M00 = _matrix[0]; + const float M10 = _matrix[1]; + const float M01 = _matrix[0 + 1 * 2]; + const float M11 = _matrix[1 + 1 * 2]; + const float M02 = _matrix[0 + 2 * 2]; + const float M12 = _matrix[1 + 2 * 2]; + + // "M00 * x" and "M10 * x", when x = window.x.start + const float start_x0 = M00 * window.x().start(); + const float start_y0 = M10 * window.x().start(); + + // const_x0 and const_y0 are the constant parts of x0 and y0 during the row processing + float const_x0 = M01 * y_cur + M02; + float const_y0 = M11 * y_cur + M12; + + float x0 = start_x0 + const_x0; + float y0 = start_y0 + const_y0; + + execute_window_loop(window, [&](const Coordinates & id) + { + // Check if we are processing a new row. If so, update the current row (y_cur), x0 and y0 + if(y_cur != id.y()) + { + y_cur = id.y(); + + const_x0 = M01 * y_cur + M02; + const_y0 = M11 * y_cur + M12; + + x0 = start_x0 + const_x0; + y0 = start_y0 + const_y0; + } + + // Only load from (x0, y0) if the point is within the valid region. + // Otherwise load from the edge of the valid region. 
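+ // Out-of-range source coordinates are not rejected here; the else branch below
+ // clamps them to the nearest edge of the valid region, so the border pixels of
+ // the input are effectively replicated outwards.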
+ if((min_y <= y0) && (y0 < max_y) && (min_x <= x0) && (x0 < max_x)) + { + switch(interpolation) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + *out.ptr() = nearest_interpolation(in.ptr(), x0, y0, stride); + break; + case InterpolationPolicy::BILINEAR: + *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, x0, y0); + break; + default: + ARM_COMPUTE_ERROR("Interpolation not supported"); + } + } + else + { + // Clamp coordinates + const auto xi = clamp(x0, min_x, max_x - 1); + const auto yi = clamp(y0, min_y, max_y - 1); + + *out.ptr() = *(in.ptr() + xi + yi * stride); + } + + x0 += M00; + y0 += M10; + }, + in, out); +} + +template +void NEWarpPerspectiveKernel::warp_undefined(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int min_x = _input->info()->valid_region().anchor[0]; + const int max_x = min_x + _input->info()->valid_region().shape[0]; + const int min_y = _input->info()->valid_region().anchor[1]; + const int max_y = min_y + _input->info()->valid_region().shape[1]; + const size_t stride = _input->info()->strides_in_bytes()[1]; + + // x0 = M00 * x + M01 * y + M02 + // y0 = M10 * x + M11 * y + M12 + // z0 = M20 * x + M21 * y + M22 + // xn = x0 / z0 + // yn = y0 / z0 + const float M00 = _matrix[0]; + const float M10 = _matrix[1]; + const float M20 = _matrix[2]; + const float M01 = _matrix[0 + 1 * 3]; + const float M11 = _matrix[1 + 1 * 3]; + const float M21 = _matrix[2 + 1 * 3]; + const float M02 = _matrix[0 + 2 * 3]; + const float M12 = _matrix[1 + 2 * 3]; + const float M22 = _matrix[2 + 2 * 3]; + + // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start + const float start_x0 = M00 * window.x().start(); + const float start_y0 = M10 * window.x().start(); + const float start_z0 = M20 * window.x().start(); + + // Current row + int y_cur = window.y().start(); + + // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing + float const_x0 = M01 * y_cur + M02; + float const_y0 = M11 * y_cur + M12; + float const_z0 = M21 * y_cur + M22; + + // Perspective warp coordinates + float x0 = start_x0 + const_x0; + float y0 = start_y0 + const_y0; + float z0 = start_z0 + const_z0; + + execute_window_loop(window, [&](const Coordinates & id) + { + // Check if we are processing a new row. If so, update the current processed row (y_cur), x0, y0 and z0 + if(y_cur != id.y()) + { + y_cur = id.y(); + + const_x0 = M01 * y_cur + M02; + const_y0 = M11 * y_cur + M12; + const_z0 = M21 * y_cur + M22; + + x0 = start_x0 + const_x0; + y0 = start_y0 + const_y0; + z0 = start_z0 + const_z0; + } + + const float xn = x0 / z0; + const float yn = y0 / z0; + + // Only write to output if xn and yn are within the valid region. + // Otherwise the read value would be undefined. 
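+ // The homogeneous coordinates (x0, y0, z0) are updated incrementally by
+ // (M00, M10, M20) per pixel, but the projective divide xn = x0 / z0,
+ // yn = y0 / z0 above still has to be performed for every pixel before the
+ // range check.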
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x)) + { + switch(interpolation) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride); + break; + case InterpolationPolicy::BILINEAR: + *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn); + break; + default: + ARM_COMPUTE_ERROR("Interpolation not supported"); + } + } + + x0 += M00; + y0 += M10; + z0 += M20; + }, + in, out); +} + +template +void NEWarpPerspectiveKernel::warp_constant(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int min_x = _input->info()->valid_region().anchor[0]; + const int max_x = min_x + _input->info()->valid_region().shape[0]; + const int min_y = _input->info()->valid_region().anchor[1]; + const int max_y = min_y + _input->info()->valid_region().shape[1]; + const size_t stride = _input->info()->strides_in_bytes()[1]; + + // x0 = M00 * x + M01 * y + M02 + // y0 = M10 * x + M11 * y + M12 + // z0 = M20 * x + M21 * y + M22 + // xn = x0 / z0 + // yn = y0 / z0 + const float M00 = _matrix[0]; + const float M10 = _matrix[1]; + const float M20 = _matrix[2]; + const float M01 = _matrix[0 + 1 * 3]; + const float M11 = _matrix[1 + 1 * 3]; + const float M21 = _matrix[2 + 1 * 3]; + const float M02 = _matrix[0 + 2 * 3]; + const float M12 = _matrix[1 + 2 * 3]; + const float M22 = _matrix[2 + 2 * 3]; + + // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start + const float start_x0 = M00 * window.x().start(); + const float start_y0 = M10 * window.x().start(); + const float start_z0 = M20 * window.x().start(); + + // Current row + int y_cur = window.y().start(); + + // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing + float const_x0 = M01 * y_cur + M02; + float const_y0 = M11 * y_cur + M12; + float const_z0 = M21 * y_cur + M22; + + // Perspective warp coordinates + float x0 = start_x0 + const_x0; + float y0 = start_y0 + const_y0; + float z0 = start_z0 + const_z0; + + execute_window_loop(window, [&](const Coordinates & id) + { + // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0 + if(y_cur != id.y()) + { + y_cur = id.y(); + + const_x0 = M01 * y_cur + M02; + const_y0 = M11 * y_cur + M12; + const_z0 = M21 * y_cur + M22; + + x0 = start_x0 + const_x0; + y0 = start_y0 + const_y0; + z0 = start_z0 + const_z0; + } + + const float xn = x0 / z0; + const float yn = y0 / z0; + + // Only use input values if xn and yn are within the valid region. + // Otherwise write the constant border value. 
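+ // If z0 happens to be zero, the divide above yields inf/NaN coordinates; with
+ // IEEE-754 comparison semantics those fail the range check below, so such
+ // pixels also receive _constant_border_value rather than causing an
+ // out-of-bounds read.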
+ if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x)) + { + switch(interpolation) + { + case InterpolationPolicy::NEAREST_NEIGHBOR: + *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride); + break; + case InterpolationPolicy::BILINEAR: + *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn); + break; + default: + ARM_COMPUTE_ERROR("Interpolation not supported"); + } + } + else + { + *out.ptr() = _constant_border_value; + } + + x0 += M00; + y0 += M10; + z0 += M20; + }, + in, out); +} + +template +void NEWarpPerspectiveKernel::warp_replicate(const Window &window) +{ + // Don't increment in X and Y direction for the input tensor + // A pointer to the start of this plane is needed as base for the precomputed offsets + Window win_in(window); + win_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + + Iterator in(_input, win_in); + Iterator out(_output, window); + + const int min_x = _input->info()->valid_region().anchor[0]; + const int max_x = min_x + _input->info()->valid_region().shape[0]; + const int min_y = _input->info()->valid_region().anchor[1]; + const int max_y = min_y + _input->info()->valid_region().shape[1]; + const size_t stride = _input->info()->strides_in_bytes()[1]; + + // Current row + int y_cur = window.y().start(); + + // x0 = M00 * x + M01 * y + M02 + // y0 = M10 * x + M11 * y + M12 + // z0 = M20 * x + M21 * y + M22 + // xn = x0 / z0 + // yn = y0 / z0 + const float M00 = _matrix[0]; + const float M10 = _matrix[1]; + const float M20 = _matrix[2]; + const float M01 = _matrix[0 + 1 * 3]; + const float M11 = _matrix[1 + 1 * 3]; + const float M21 = _matrix[2 + 1 * 3]; + const float M02 = _matrix[0 + 2 * 3]; + const float M12 = _matrix[1 + 2 * 3]; + const float M22 = _matrix[2 + 2 * 3]; + + // "M00 * x", "M10 * x" and "M20 * x", when x = window.x.start + const float start_x0 = M00 * window.x().start(); + const float start_y0 = M10 * window.x().start(); + const float start_z0 = M20 * window.x().start(); + + // const_x0, const_y0 and const_z0 are the constant parts of x0, y0 and z0 during the row processing + float const_x0 = M01 * y_cur + M02; + float const_y0 = M11 * y_cur + M12; + float const_z0 = M21 * y_cur + M22; + + // Perspective warp coordinates + float x0 = start_x0 + const_x0; + float y0 = start_y0 + const_y0; + float z0 = start_z0 + const_z0; + + execute_window_loop(window, [&](const Coordinates & id) + { + // Check if we are processing a new row. If so, update the current row (y_cur), x0, y0 and z0 + if(y_cur != id.y()) + { + y_cur = id.y(); + + const_x0 = M01 * y_cur + M02; + const_y0 = M11 * y_cur + M12; + const_z0 = M21 * y_cur + M22; + + x0 = start_x0 + const_x0; + y0 = start_y0 + const_y0; + z0 = start_z0 + const_z0; + } + + const float xn = x0 / z0; + const float yn = y0 / z0; + + // Only load from (x0, y0) if the point is within the valid region. + // Otherwise load from the edge of the valid region. 
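+ // Note that the range check uses the projected coordinates (xn, yn), while the
+ // clamping fallback in the else branch below clamps the unprojected (x0, y0)
+ // values, mirroring the affine replicate path above.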
+        if((min_y <= yn) && (yn < max_y) && (min_x <= xn) && (xn < max_x))
+        {
+            switch(interpolation)
+            {
+                case InterpolationPolicy::NEAREST_NEIGHBOR:
+                    *out.ptr() = nearest_interpolation(in.ptr(), xn, yn, stride);
+                    break;
+                case InterpolationPolicy::BILINEAR:
+                    *out.ptr() = pixel_bilinear_c1u8(in.ptr(), stride, xn, yn);
+                    break;
+                default:
+                    ARM_COMPUTE_ERROR("Interpolation not supported");
+            }
+        }
+        else
+        {
+            // Clamp coordinates
+            const auto xi = clamp<int>(xn, min_x, max_x - 1);
+            const auto yi = clamp<int>(yn, min_y, max_y - 1);
+
+            *out.ptr() = *(in.ptr() + xi + yi * stride);
+        }
+
+        x0 += M00;
+        y0 += M10;
+        z0 += M20;
+    },
+    in, out);
+}
+
+template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
+template class arm_compute::NEWarpAffineKernel<InterpolationPolicy::BILINEAR>;
+template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::NEAREST_NEIGHBOR>;
+template class arm_compute::NEWarpPerspectiveKernel<InterpolationPolicy::BILINEAR>;
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..aa6be44bee
--- /dev/null
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensor.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/core/Validate.h"
+
+using namespace arm_compute;
+
+namespace
+{
+template <typename T>
+void weights_reshape(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window)
+{
+    const unsigned int kernel_size = input->info()->dimension(0);
+    const unsigned int kernel_depth = input->info()->dimension(2);
+    const unsigned int input_stride_x = input->info()->strides_in_bytes().x();
+    const unsigned int input_stride_y = input->info()->strides_in_bytes().y();
+    const unsigned int input_stride_z = input->info()->strides_in_bytes().z();
+    const unsigned int output_stride_y = output->info()->strides_in_bytes().y();
+
+    // Create iterators
+    Iterator in(input, window);
+    execute_window_loop(window, [&](const Coordinates & id)
+    {
+        // Get column index
+        const int kernel_idx = id[3];
+        const int kernel_idz = id[4];
+
+        // Setup pointers
+        const uint8_t *tmp_input_ptr = in.ptr();
+        uint8_t *tmp_output_ptr = output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+        const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+        const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+        // Linearize volume
+        for(unsigned int d = 0; d < kernel_depth; ++d)
+        {
+            for(unsigned int j = 0; j < kernel_size; ++j)
+            {
+                for(unsigned int i = 0; i < kernel_size; ++i)
+                {
+                    *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(tmp_input_ptr));
+                    tmp_input_ptr += input_stride_x;
+                    tmp_output_ptr += output_stride_y;
+                }
+                curr_input_row_ptr += input_stride_y;
+                tmp_input_ptr = curr_input_row_ptr;
+            }
+            curr_input_depth_ptr += input_stride_z;
+            curr_input_row_ptr = curr_input_depth_ptr;
+            tmp_input_ptr = curr_input_depth_ptr;
+        }
+
+        // Add bias
+        if(bias != nullptr)
+        {
+            *(reinterpret_cast<T *>(tmp_output_ptr)) = *(reinterpret_cast<const T *>(bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz))));
+        }
+    },
+    in);
+}
+} // namespace
+
+NEWeightsReshapeKernel::NEWeightsReshapeKernel()
+    : _func(nullptr), _input(nullptr), _bias(nullptr), _output(nullptr)
+{
+}
+
+void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
+{
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(output);
+    ARM_COMPUTE_ERROR_ON(input->info()->dimension(0) != input->info()->dimension(1));
+
+    const DataType dt = input->info()->data_type();
+    const int fixed_point_position = input->info()->fixed_point_position();
+
+    TensorShape output_shape{ input->info()->tensor_shape() };
+    output_shape.collapse(3);
+    const size_t tmp_dim = output_shape[0];
+    output_shape.set(0, output_shape[1]);
+    output_shape.set(1, tmp_dim + (bias != nullptr ? 1 : 0));
+
+    // Set data type and shape for output tensor if not yet configured
+    set_data_type_if_unknown(*output->info(), dt);
+    set_fixed_point_position_if_zero(*output->info(), fixed_point_position);
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::F32, DataType::QS8);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
+    if(bias != nullptr)
+    {
+        TensorShape bias_shape{ input->info()->tensor_shape()[3] };
+
+        // Set data type and shape for bias tensor if not yet configured
+        set_data_type_if_unknown(*bias->info(), dt);
+        set_fixed_point_position_if_zero(*bias->info(), fixed_point_position);
+        set_shape_if_empty(*bias->info(), bias_shape);
+
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(bias->info()->tensor_shape(), bias_shape);
+        ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::F32, DataType::QS8);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias);
+        ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, bias);
+    }
+
+    _input = input;
+    _bias = bias;
+    _output = output;
+
+    switch(_input->info()->data_type())
+    {
+        case DataType::F32:
+        {
+            _func = &weights_reshape<float>;
+            break;
+        }
+        case DataType::QS8:
+        {
+            _func = &weights_reshape<qint8_t>;
+            break;
+        }
+        default:
+        {
+            ARM_COMPUTE_ERROR("Data type not supported");
+            break;
+        }
+    }
+
+    // Configure kernel
+    Window window = calculate_max_window(*input->info(), Steps());
+    window.set(Window::DimX, Window::Dimension(0, _input->info()->dimension(0), _input->info()->dimension(0)));
+    window.set(Window::DimY, Window::Dimension(0, _input->info()->dimension(1), _input->info()->dimension(1)));
+    window.set(Window::DimZ, Window::Dimension(0, _input->info()->dimension(2), _input->info()->dimension(2)));
+
+    // The NEWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
+    output->info()->set_valid_region(ValidRegion(Coordinates(), output->info()->tensor_shape()));
+
+    INEKernel::configure(window);
+}
+
+void NEWeightsReshapeKernel::run(const Window &window)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+
+    (*_func)(_input, _bias, _output, window);
+}
-- 
cgit v1.2.1
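For reference, a minimal standalone sketch (not part of the patch) of the shape computation performed in NEWeightsReshapeKernel::configure(): each kernel window of a 4D weights tensor [kernel_size, kernel_size, kernel_depth, num_kernels] is linearised into one column, with one extra element when a bias is appended. The helper name and parameters below are hypothetical, chosen only to mirror the logic above.

    // Hypothetical helper mirroring NEWeightsReshapeKernel::configure()'s shape logic.
    #include <array>
    #include <cstddef>

    std::array<std::size_t, 2> reshaped_weights_shape(std::size_t kernel_size,
                                                      std::size_t kernel_depth,
                                                      std::size_t num_kernels,
                                                      bool        has_bias)
    {
        // Column length: k * k * depth elements per kernel, plus one for the optional bias.
        const std::size_t column_length = kernel_size * kernel_size * kernel_depth + (has_bias ? 1u : 0u);

        // Dimension 0 indexes the kernels, dimension 1 runs along the linearised column,
        // matching output_shape.set(0, ...) and output_shape.set(1, ...) above.
        return { num_kernels, column_length };
    }

For example, 128 kernels of size 3x3x64 with a bias give a reshaped output of shape [128, 3*3*64 + 1] = [128, 577].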