From 6ff3b19ee6120edf015fad8caab2991faa3070af Mon Sep 17 00:00:00 2001 From: Anthony Barbier Date: Mon, 4 Sep 2017 18:44:23 +0100 Subject: COMPMID-344 Updated doxygen Change-Id: I32f7b84daa560e460b77216add529c8fa8b327ae --- arm_compute/core/NEON/INEKernel.h | 33 + arm_compute/core/NEON/INESimpleKernel.h | 33 + arm_compute/core/NEON/NEColorConvertHelper.inl | 888 +++++++++++++++++ arm_compute/core/NEON/NEFixedPoint.h | 686 +++++++++++++ arm_compute/core/NEON/NEFixedPoint.inl | 1018 ++++++++++++++++++++ arm_compute/core/NEON/NEKernels.h | 96 ++ arm_compute/core/NEON/NEMath.h | 96 ++ arm_compute/core/NEON/NEMath.inl | 141 +++ .../core/NEON/kernels/NEAbsoluteDifferenceKernel.h | 82 ++ arm_compute/core/NEON/kernels/NEAccumulateKernel.h | 122 +++ .../core/NEON/kernels/NEActivationLayerKernel.h | 84 ++ .../core/NEON/kernels/NEArithmeticAdditionKernel.h | 79 ++ .../NEON/kernels/NEArithmeticSubtractionKernel.h | 79 ++ .../NEON/kernels/NEBatchNormalizationLayerKernel.h | 78 ++ arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h | 66 ++ arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEBox3x3Kernel.h | 62 ++ arm_compute/core/NEON/kernels/NECannyEdgeKernel.h | 190 ++++ .../core/NEON/kernels/NEChannelCombineKernel.h | 125 +++ .../core/NEON/kernels/NEChannelExtractKernel.h | 109 +++ arm_compute/core/NEON/kernels/NECol2ImKernel.h | 100 ++ .../core/NEON/kernels/NEColorConvertKernel.h | 88 ++ .../core/NEON/kernels/NEConvolutionKernel.h | 251 +++++ .../NEON/kernels/NECumulativeDistributionKernel.h | 80 ++ .../core/NEON/kernels/NEDepthConcatenateKernel.h | 76 ++ .../core/NEON/kernels/NEDepthConvertKernel.h | 68 ++ arm_compute/core/NEON/kernels/NEDerivativeKernel.h | 94 ++ arm_compute/core/NEON/kernels/NEDilateKernel.h | 49 + .../NEDirectConvolutionLayerBiasAccumulateKernel.h | 74 ++ .../NEON/kernels/NEDirectConvolutionLayerKernel.h | 76 ++ arm_compute/core/NEON/kernels/NEErodeKernel.h | 49 + .../core/NEON/kernels/NEFastCornersKernel.h | 72 ++ arm_compute/core/NEON/kernels/NEFillArrayKernel.h | 73 ++ arm_compute/core/NEON/kernels/NEFillBorderKernel.h | 79 ++ .../core/NEON/kernels/NEFillInnerBorderKernel.h | 75 ++ .../core/NEON/kernels/NEGEMMInterleave4x4Kernel.h | 79 ++ .../NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h | 88 ++ .../kernels/NEGEMMMatrixAccumulateBiasesKernel.h | 63 ++ .../core/NEON/kernels/NEGEMMMatrixAdditionKernel.h | 81 ++ .../core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h | 75 ++ .../core/NEON/kernels/NEGEMMTranspose1xWKernel.h | 82 ++ .../core/NEON/kernels/NEGaussian3x3Kernel.h | 50 + .../core/NEON/kernels/NEGaussian5x5Kernel.h | 73 ++ .../core/NEON/kernels/NEGaussianPyramidKernel.h | 100 ++ .../core/NEON/kernels/NEHOGDescriptorKernel.h | 141 +++ .../core/NEON/kernels/NEHOGDetectorKernel.h | 87 ++ .../core/NEON/kernels/NEHarrisCornersKernel.h | 126 +++ arm_compute/core/NEON/kernels/NEHistogramKernel.h | 129 +++ arm_compute/core/NEON/kernels/NEIm2ColKernel.h | 114 +++ .../core/NEON/kernels/NEIntegralImageKernel.h | 50 + arm_compute/core/NEON/kernels/NELKTrackerKernel.h | 144 +++ .../NELocallyConnectedMatrixMultiplyKernel.h | 64 ++ .../core/NEON/kernels/NEMagnitudePhaseKernel.h | 164 ++++ arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h | 76 ++ arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h | 50 + .../core/NEON/kernels/NEMinMaxLocationKernel.h | 161 ++++ .../core/NEON/kernels/NENonLinearFilterKernel.h | 147 +++ 
.../NEON/kernels/NENonMaximaSuppression3x3Kernel.h | 99 ++ .../core/NEON/kernels/NENormalizationLayerKernel.h | 106 ++ .../NEON/kernels/NEPixelWiseMultiplicationKernel.h | 105 ++ .../core/NEON/kernels/NEPoolingLayerKernel.h | 106 ++ arm_compute/core/NEON/kernels/NERemapKernel.h | 78 ++ arm_compute/core/NEON/kernels/NEScaleKernel.h | 89 ++ arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h | 82 ++ arm_compute/core/NEON/kernels/NESobel3x3Kernel.h | 82 ++ arm_compute/core/NEON/kernels/NESobel5x5Kernel.h | 118 +++ arm_compute/core/NEON/kernels/NESobel7x7Kernel.h | 122 +++ .../core/NEON/kernels/NESoftmaxLayerKernel.h | 135 +++ .../core/NEON/kernels/NETableLookupKernel.h | 76 ++ arm_compute/core/NEON/kernels/NEThresholdKernel.h | 81 ++ arm_compute/core/NEON/kernels/NETransposeKernel.h | 78 ++ arm_compute/core/NEON/kernels/NEWarpKernel.h | 117 +++ .../core/NEON/kernels/NEWeightsReshapeKernel.h | 94 ++ 75 files changed, 9287 insertions(+) create mode 100644 arm_compute/core/NEON/INEKernel.h create mode 100644 arm_compute/core/NEON/INESimpleKernel.h create mode 100644 arm_compute/core/NEON/NEColorConvertHelper.inl create mode 100644 arm_compute/core/NEON/NEFixedPoint.h create mode 100644 arm_compute/core/NEON/NEFixedPoint.inl create mode 100644 arm_compute/core/NEON/NEKernels.h create mode 100644 arm_compute/core/NEON/NEMath.h create mode 100644 arm_compute/core/NEON/NEMath.inl create mode 100644 arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEAccumulateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEActivationLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEBox3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NECannyEdgeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEChannelCombineKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEChannelExtractKernel.h create mode 100644 arm_compute/core/NEON/kernels/NECol2ImKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEColorConvertKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEConvolutionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDepthConvertKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDerivativeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDilateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEErodeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFastCornersKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFillArrayKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFillBorderKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h create mode 100644 
arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEHistogramKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEIm2ColKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEIntegralImageKernel.h create mode 100644 arm_compute/core/NEON/kernels/NELKTrackerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h create mode 100644 arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h create mode 100644 arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NERemapKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEScaleKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESobel3x3Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESobel5x5Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESobel7x7Kernel.h create mode 100644 arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h create mode 100644 arm_compute/core/NEON/kernels/NETableLookupKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEThresholdKernel.h create mode 100644 arm_compute/core/NEON/kernels/NETransposeKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEWarpKernel.h create mode 100644 arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h (limited to 'arm_compute/core/NEON') diff --git a/arm_compute/core/NEON/INEKernel.h b/arm_compute/core/NEON/INEKernel.h new file mode 100644 index 0000000000..3ac8164a51 --- /dev/null +++ b/arm_compute/core/NEON/INEKernel.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2017 ARM Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_INEKERNEL_H__ +#define __ARM_COMPUTE_INEKERNEL_H__ + +#include "arm_compute/core/CPP/ICPPKernel.h" + +namespace arm_compute +{ +using INEKernel = ICPPKernel; +} +#endif /*__ARM_COMPUTE_INEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/INESimpleKernel.h b/arm_compute/core/NEON/INESimpleKernel.h new file mode 100644 index 0000000000..ca25532ef1 --- /dev/null +++ b/arm_compute/core/NEON/INESimpleKernel.h @@ -0,0 +1,33 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_INESIMPLEKERNEL_H__ +#define __ARM_COMPUTE_INESIMPLEKERNEL_H__ + +#include "arm_compute/core/CPP/ICPPSimpleKernel.h" + +namespace arm_compute +{ +using INESimpleKernel = ICPPSimpleKernel; +} +#endif /*__ARM_COMPUTE_INESIMPLEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/NEColorConvertHelper.inl b/arm_compute/core/NEON/NEColorConvertHelper.inl new file mode 100644 index 0000000000..9be7c8a658 --- /dev/null +++ b/arm_compute/core/NEON/NEColorConvertHelper.inl @@ -0,0 +1,888 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/IMultiImage.h"
+#include "arm_compute/core/Utils.h"
+
+#include <arm_neon.h>
+
+namespace
+{
+constexpr float red_coef_bt709    = 1.5748f;
+constexpr float green_coef_bt709  = -0.1873f;
+constexpr float green_coef2_bt709 = -0.4681f;
+constexpr float blue_coef_bt709   = 1.8556f;
+
+constexpr float rgb2yuv_bt709_kr = 0.2126f;
+constexpr float rgb2yuv_bt709_kb = 0.0722f;
+// K_g = 1 - K_r - K_b
+constexpr float rgb2yuv_bt709_kg = 0.7152f;
+// C_u = 1 / (2 * (1 - K_b))
+constexpr float rgb2yuv_bt709_cu = 0.5389f;
+// C_v = 1 / (2 * (1 - K_r))
+constexpr float rgb2yuv_bt709_cv = 0.6350f;
+
+inline void convert_uint8x16_to_float32x4x4(const uint8x16_t &in, float32x4x4_t &out)
+{
+    const auto tmp1 = vmovl_u8(vget_low_u8(in));
+    out.val[0]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp1)));
+    out.val[1]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp1)));
+    const auto tmp2 = vmovl_u8(vget_high_u8(in));
+    out.val[2]      = vcvtq_f32_u32(vmovl_u16(vget_low_u16(tmp2)));
+    out.val[3]      = vcvtq_f32_u32(vmovl_u16(vget_high_u16(tmp2)));
+}
+
+inline void convert_float32x4x3_to_uint8x8x3(const float32x4x3_t &in1, const float32x4x3_t &in2, uint8x8x3_t &out)
+{
+    out.val[0] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[0])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[0]))));
+    out.val[1] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[1])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[1]))));
+    out.val[2] = vqmovn_u16(vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in1.val[2])),
+                                         vqmovn_u32(vcvtq_u32_f32(in2.val[2]))));
+}
+
+inline void convert_float32x4x4_to_unit8x16(const float32x4x4_t &in, uint8x16_t &out)
+{
+    const auto low = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[0])),
+                                  vqmovn_u32(vcvtq_u32_f32(in.val[1])));
+    const auto high = vcombine_u16(vqmovn_u32(vcvtq_u32_f32(in.val[2])),
+                                   vqmovn_u32(vcvtq_u32_f32(in.val[3])));
+    out = vcombine_u8(vqmovn_u16(low), vqmovn_u16(high));
+}
+
+inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
+                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
+{
+    /*
+    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
+    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
+    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
+    */
+    const auto c128 = vdupq_n_f32(128.f);
+
+    // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
+    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
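+    // vmlaq_n_f32(acc, v, s) computes acc + v * s in every lane, so the two
+    // steps below accumulate the remaining G and B terms of the Y equation.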
+    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
+    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);
+
+    // U = (B - Y) / (2 * (1 - K_b))
+    uvec = vsubq_f32(bvec, yvec);
+    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);
+
+    // V = (R - Y) / (2 * (1 - K_r))
+    vvec = vsubq_f32(rvec, yvec);
+    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
+}
+
+inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
+                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
+{
+    float32x4x3_t rgb1, rgb2;
+
+    // Compute: cb - 128 and cr - 128;
+    const auto c128 = vdupq_n_f32(128.f);
+    uvec_val        = vsubq_f32(uvec_val, c128);
+    vvec_val        = vsubq_f32(vvec_val, c128);
+
+    // Compute:
+    // r =  0.0000f*f_u + 1.5748f*f_v;
+    // g = -0.1873f*f_u - 0.4681f*f_v;
+    // b =  1.8556f*f_u + 0.0000f*f_v;
+    const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
+    const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
+    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
+                                 vmulq_n_f32(vvec_val, green_coef2_bt709));
+
+    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
+    // The result is stored in two float32x4x3_t which are then converted to one uint8x8x3_t
+    // and written back to memory using the vst3 instruction.
+
+    rgb1.val[0] = vaddq_f32(yvec_val, red);
+    rgb1.val[1] = vaddq_f32(yvec_val, green);
+    rgb1.val[2] = vaddq_f32(yvec_val, blue);
+
+    rgb2.val[0] = vaddq_f32(yyvec_val, red);
+    rgb2.val[1] = vaddq_f32(yyvec_val, green);
+    rgb2.val[2] = vaddq_f32(yyvec_val, blue);
+
+    uint8x8x3_t u8_rgb;
+    convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);
+
+    if(!alpha)
+    {
+        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
+        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
+        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
+        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
+        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
+        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
+        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
+        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
+    }
+    else
+    {
+        uint8x8x4_t u8_rgba;
+        u8_rgba.val[0] = u8_rgb.val[0];
+        u8_rgba.val[1] = u8_rgb.val[1];
+        u8_rgba.val[2] = u8_rgb.val[2];
+        u8_rgba.val[3] = vdup_n_u8(255);
+        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
+        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
+        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
+        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
+        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
+        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
+        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
+        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
+    }
+}
+
+inline uint8x16x3_t load_rgb(const unsigned char *const ptr, const bool alpha)
+{
+    uint8x16x3_t rgb;
+
+    if(alpha)
+    {
+        const auto tmp = vld4q_u8(ptr);
+        rgb.val[0]     = tmp.val[0];
+        rgb.val[1]     = tmp.val[1];
+        rgb.val[2]     = tmp.val[2];
+    }
+    else
+    {
+        rgb = vld3q_u8(ptr);
+    }
+
+    return rgb;
+}
+
+inline void rgb_to_yuv_conversion(uint8x16x3_t &vec_top, uint8x16x3_t &vec_bottom)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    float32x4x4_t frvec_top, fgvec_top, fbvec_top;
+    convert_uint8x16_to_float32x4x4(vec_top.val[0], frvec_top);
+    convert_uint8x16_to_float32x4x4(vec_top.val[1], fgvec_top);
+    convert_uint8x16_to_float32x4x4(vec_top.val[2], fbvec_top);
+
+    float32x4x4_t frvec_bottom, fgvec_bottom, fbvec_bottom;
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[0], frvec_bottom);
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[1], fgvec_bottom);
+    convert_uint8x16_to_float32x4x4(vec_bottom.val[2], fbvec_bottom);
+
+    float32x4x4_t fyvec_top, fuvec_top, fvvec_top;
+    float32x4x4_t fyvec_bottom, fuvec_bottom, fvvec_bottom;
+
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec_top.val[i], fgvec_top.val[i], fbvec_top.val[i],
+                               fyvec_top.val[i], fuvec_top.val[i], fvvec_top.val[i]);
+        rgb_to_yuv_calculation(frvec_bottom.val[i], fgvec_bottom.val[i], fbvec_bottom.val[i],
+                               fyvec_bottom.val[i], fuvec_bottom.val[i], fvvec_bottom.val[i]);
+    }
+
+    convert_float32x4x4_to_unit8x16(fyvec_top, vec_top.val[0]);
+    convert_float32x4x4_to_unit8x16(fuvec_top, vec_top.val[1]);
+    convert_float32x4x4_to_unit8x16(fvvec_top, vec_top.val[2]);
+    convert_float32x4x4_to_unit8x16(fyvec_bottom, vec_bottom.val[0]);
+    convert_float32x4x4_to_unit8x16(fuvec_bottom, vec_bottom.val[1]);
+    convert_float32x4x4_to_unit8x16(fvvec_bottom, vec_bottom.val[2]);
+}
+
+inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_uv)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0]    = rvec_top;
+    vec_top.val[1]    = gvec_top;
+    vec_top.val[2]    = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+    const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]);
+    const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]);
+    const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]);
+    const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]);
+
+    uint8x8x2_t uvvec;
+    uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp));
+    uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp));
+
+    vst2_u8(out_uv, uvvec);
+}
+
+inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top,
+                              const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom,
+                              unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    uint8x16x3_t vec_top, vec_bottom;
+    vec_top.val[0]    = rvec_top;
+    vec_top.val[1]    = gvec_top;
+    vec_top.val[2]    = bvec_top;
+    vec_bottom.val[0] = rvec_bottom;
+    vec_bottom.val[1] = gvec_bottom;
+    vec_bottom.val[2] = bvec_bottom;
+
+    rgb_to_yuv_conversion(vec_top, vec_bottom);
+
+    vst1q_u8(out_y_top, vec_top.val[0]);
+    vst1q_u8(out_y_bottom, vec_bottom.val[0]);
+
+    const auto uvvec_top    = vuzpq_u8(vec_top.val[1], vec_top.val[2]);
+    const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]);
+    const auto uvvec        = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]),
+                                        vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1]));
+
+    vst1_u8(out_u, vget_low_u8(uvvec));
+    vst1_u8(out_v, vget_high_u8(uvvec));
+}
+
+inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec,
+                              unsigned char *const __restrict out_y,
+                              unsigned char *const __restrict out_u,
+                              unsigned char *const __restrict out_v)
+{
+    // Convert the uint8x16_t to float32x4x4_t
+    float32x4x4_t frvec, fgvec, fbvec;
+    convert_uint8x16_to_float32x4x4(rvec, frvec);
+    convert_uint8x16_to_float32x4x4(gvec, fgvec);
+    convert_uint8x16_to_float32x4x4(bvec, fbvec);
+
+    float32x4x4_t fyvec, fuvec, fvvec;
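+    // The 16 converted pixels are processed as four float32x4_t lane groups;
+    // the same rgb_to_yuv_calculation() core handles each group below.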
+    for(auto i = 0; i < 4; ++i)
+    {
+        rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i],
+                               fyvec.val[i], fuvec.val[i], fvvec.val[i]);
+    }
+
+    uint8x16_t yvec, uvec, vvec;
+    convert_float32x4x4_to_unit8x16(fyvec, yvec);
+    convert_float32x4x4_to_unit8x16(fuvec, uvec);
+    convert_float32x4x4_to_unit8x16(fvvec, vvec);
+
+    vst1q_u8(out_y, yvec);
+    vst1q_u8(out_u, uvec);
+    vst1q_u8(out_v, vvec);
+}
+}
+
+namespace arm_compute
+{
+void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto   ta1 = vld3q_u8(in.ptr());
+        uint8x16x4_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        ta2.val[3] = vdupq_n_u8(255);
+        vst4q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        const auto   ta1 = vld4q_u8(in.ptr());
+        uint8x16x3_t ta2;
+        ta2.val[0] = ta1.val[0];
+        ta2.val[1] = ta1.val[1];
+        ta2.val[2] = ta1.val[2];
+        vst3q_u8(out.ptr(), ta2);
+    },
+    in, out);
+}
+
+template <bool yuyv, bool alpha>
+void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+
+    const auto input_ptr  = static_cast<const IImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ? 32 : 24;
+    constexpr auto shift        = yuyv ? 0 : 1;
+
+    Iterator in(input_ptr, win);
+    Iterator out(output_ptr, win);
+
+    execute_window_loop(win, [&](const Coordinates & id)
+    {
+        float32x4x4_t uvec, yvec, vvec, yyvec;
+        const auto    ta = vld4q_u8(in.ptr());
+        //ta.val[0] = Y0 Y2 Y4 Y6 ...
+        //ta.val[1] = U0 U2 U4 U6 ...
+        //ta.val[2] = Y1 Y3 Y5 Y7 ...
+        //ta.val[3] = V0 V2 V4 V6 ...
+
+        // Convert the uint8x16x4_t to float32x4x4_t
+        convert_uint8x16_to_float32x4x4(ta.val[0 + shift], yvec);
+        convert_uint8x16_to_float32x4x4(ta.val[1 - shift], uvec);
+        convert_uint8x16_to_float32x4x4(ta.val[2 + shift], yyvec);
+        convert_uint8x16_to_float32x4x4(ta.val[3 - shift], vvec);
+
+        yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
+        yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);
+    },
+    in, out);
+}
+
+template <bool uv, bool alpha>
+void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
+{
+    ARM_COMPUTE_ERROR_ON(nullptr == input);
+    ARM_COMPUTE_ERROR_ON(nullptr == output);
+    win.validate();
+
+    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
+    const auto output_ptr = static_cast<IImage *__restrict>(output);
+
+    constexpr auto element_size = alpha ?
32 : 24; + const auto out_stride = output_ptr->info()->strides_in_bytes().y(); + constexpr auto shift = uv ? 0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec; + convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top); + convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top); + convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom); + convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom); + convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift], uvec); + convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift], vvec); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_uv, out); +} + +template +void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto element_size = alpha ? 
32 : 24; + const auto out_stride = output_ptr->info()->strides_in_bytes().y(); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u.val[0] = U0 U2 U4 U6 ... + //ta_v.val[0] = V0 V2 V4 V6 ... + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top, yyvec_top, yvec_bottom, yyvec_bottom, uvec, vvec; + convert_uint8x16_to_float32x4x4(ta_y_top.val[0], yvec_top); + convert_uint8x16_to_float32x4x4(ta_y_top.val[1], yyvec_top); + convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0], yvec_bottom); + convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1], yyvec_bottom); + convert_uint8x16_to_float32x4x4(ta_u, uvec); + convert_uint8x16_to_float32x4x4(ta_v, vvec); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha); + }, + in_y, in_u, in_v, out); +} + +template +void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = yuyv ? 
0 : 1; + + // NV12's UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); +} + +void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); +} + +template +void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = uv ? 
0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); +} + +template +void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = yuyv ? 0 : 1; + + // Destination's UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); +} + +template +void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + constexpr auto shift = uv ? 
0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); +} + +void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... 
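+        // Each subsampled U/V row is written out twice below, with every byte
+        // doubled by vst2q_u8, i.e. a nearest-neighbour 4:2:0 to 4:4:4 upsample.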
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); +} + +template +void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], + ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], + out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), + out_uv.ptr()); + }, + in, out_y, out_uv); +} + +template +void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... 
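+        // Two input rows are consumed per iteration: store_rgb_to_iyuv() writes
+        // two Y rows and averages each 2x2 block when producing the U and V planes.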
+ + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], + ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], + out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), + out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); +} + +template +void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast(input); + const auto output_ptr = static_cast(output); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates & id) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], + out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); +} +} diff --git a/arm_compute/core/NEON/NEFixedPoint.h b/arm_compute/core/NEON/NEFixedPoint.h new file mode 100644 index 0000000000..fb712611cb --- /dev/null +++ b/arm_compute/core/NEON/NEFixedPoint.h @@ -0,0 +1,686 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_NEFIXEDPOINT_H__
+#define __ARM_COMPUTE_NEFIXEDPOINT_H__
+
+#include "arm_compute/core/FixedPoint.h"
+
+#include <arm_neon.h>
+
+namespace arm_compute
+{
+using qint8x8_t    = int8x8_t;    /**< 8 bit fixed point vector with 8 elements */
+using qint8x8x2_t  = int8x8x2_t;  /**< 8 bit fixed point vector with 16 elements */
+using qint8x8x3_t  = int8x8x3_t;  /**< 8 bit fixed point vector with 24 elements */
+using qint8x8x4_t  = int8x8x4_t;  /**< 8 bit fixed point vector with 32 elements */
+using qint8x16_t   = int8x16_t;   /**< 8 bit fixed point vector with 16 elements */
+using qint8x16x2_t = int8x16x2_t; /**< 8 bit fixed point vector with 32 elements */
+using qint8x16x3_t = int8x16x3_t; /**< 8 bit fixed point vector with 48 elements */
+using qint8x16x4_t = int8x16x4_t; /**< 8 bit fixed point vector with 64 elements */
+using qint16x4_t   = int16x4_t;   /**< 16 bit fixed point vector with 4 elements */
+using qint16x4x2_t = int16x4x2_t; /**< 16 bit fixed point vector with 8 elements */
+using qint16x4x3_t = int16x4x3_t; /**< 16 bit fixed point vector with 12 elements */
+using qint16x4x4_t = int16x4x4_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8_t   = int16x8_t;   /**< 16 bit fixed point vector with 8 elements */
+using qint16x8x2_t = int16x8x2_t; /**< 16 bit fixed point vector with 16 elements */
+using qint16x8x3_t = int16x8x3_t; /**< 16 bit fixed point vector with 24 elements */
+using qint16x8x4_t = int16x8x4_t; /**< 16 bit fixed point vector with 32 elements */
+
+/** Get the lower half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_low_qs8(qint8x16_t a);
+
+/** Get the higher half of a 16 elements vector
+ *
+ * @param[in] a vector of 16 elements
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vget_high_qs8(qint8x16_t a);
+
+/** Load a single 8 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_qs8(const qint8_t *addr);
+
+/** Load a single 8 bit fixed point vector from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point vector to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_qs8(const qint8_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (4 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (4 elements)
+ */
+qint16x4_t vld1_qs16(const qint16_t *addr);
+
+/** Load a single 16 bit fixed point vector from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 16 bit fixed point vector to load
+ *
+ * @return 16 bit fixed point vector (8 elements)
+ */
+qint16x8_t vld1q_qs16(const qint16_t *addr);
+
+/** Load all lanes of 8 bit fixed point vector with same value from memory (8 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (8 elements)
+ */
+qint8x8_t vld1_dup_qs8(const qint8_t *addr);
+
+/** Load all lanes of 8 bit fixed point vector with same value from memory (16 elements)
+ *
+ * @param[in] addr Memory address of the 8 bit fixed point scalar value to load
+ *
+ * @return 8 bit fixed point vector (16 elements)
+ */
+qint8x16_t vld1q_dup_qs8(const qint8_t *addr);
+
+/** Store a single 8 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1_qs8(qint8_t *addr, qint8x8_t b);
+
+/** Store a single 8 bit fixed point vector to memory (16 elements)
+ *
+ * @param[in] addr Memory address where the 8 bit fixed point vector should be stored
+ * @param[in] b    8 bit fixed point vector to store
+ *
+ */
+void vst1q_qs8(qint8_t *addr, qint8x16_t b);
+
+/** Store a single 16 bit fixed point vector to memory (4 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1_qs16(qint16_t *addr, qint16x4_t b);
+
+/** Store a single 16 bit fixed point vector to memory (8 elements)
+ *
+ * @param[in] addr Memory address where the 16 bit fixed point vector should be stored
+ * @param[in] b    16 bit fixed point vector to store
+ *
+ */
+void vst1q_qs16(qint16_t *addr, qint16x8_t b);
+
+/** 16 bit fixed point vector saturating narrow (8 elements)
+ *
+ * @param[in] a 16 bit fixed point vector to convert
+ *
+ * @return 8 bit fixed point vector
+ */
+qint8x8_t vqmovn_q16(qint16x8_t a);
+
+/** 8 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x8_t vdup_n_qs8(qint8_t a);
+
+/** 8 bit fixed point vector duplicate (16 elements)
+ *
+ * @param[in] a 8 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8(qint8_t a);
+
+/** Duplicate a float and convert it to 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a                    Floating point value to duplicate and convert
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the vector duplication
+ */
+qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position);
+
+/** 16 bit fixed point vector duplicate (8 elements)
+ *
+ * @param[in] a 16 bit fixed point to duplicate
+ *
+ * @return The result of the vector duplication
+ */
+qint16x8_t vdupq_n_qs16(qint16_t a);
+
+/** Absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vabs_qs8(qint8x8_t a);
+
+/** Absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vabsq_qs8(qint8x16_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x8_t vqabs_qs8(qint8x8_t a);
+
+/** Saturating absolute value of 8 bit fixed point vector (16 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector absolute value
+ */
+qint8x16_t vqabsq_qs8(qint8x16_t a);
+
+/** 8 bit fixed point vector max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector max (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector max operation
+ */
+qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise max (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise max operation
+ */
+qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector min (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector min operation
+ */
+qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector pairwise min (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector pairwise min operation
+ */
+qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition
+ */
+qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating add (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 16 bit fixed point vector saturating add (4 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b);
+
+/** 16 bit fixed point vector saturating add (8 elements)
+ *
+ * @param[in] a First 16 bit fixed point input vector
+ * @param[in] b Second 16 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition. The result is saturated in case of overflow
+ */
+qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b);
+
+/** 8 bit fixed point vector saturating pairwise add (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the 16 bit fixed point vector addition.
+
+/** 8 bit fixed point vector pairwise add long (8 elements)
+ *
+ * @param[in] a 8 bit fixed point input vector
+ *
+ * @return The result of the pairwise addition: pairs of adjacent 8 bit elements are added into 16 bit elements
+ */
+int16x4_t vpaddl_qs8(qint8x8_t a);
+
+/** 8 bit fixed point vector subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction
+ */
+qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector saturating subtraction (8 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b);
+
+/** 8 bit fixed point vector saturating subtraction (16 elements)
+ *
+ * @param[in] a First 8 bit fixed point input vector
+ * @param[in] b Second 8 bit fixed point input vector
+ *
+ * @return The result of the 8 bit fixed point vector subtraction. The result is saturated in case of overflow
+ */
+qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b);
+
+/** 8 bit fixed point vector multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply (16 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication.
+ */
+qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply (16 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiplication. The result is saturated in case of overflow
+ */
+qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
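As a scalar reference (not part of the patch), the per-lane arithmetic these multiply wrappers implement, mirroring the .inl: widen, add a rounding constant, shift right by the fixed point position, narrow.

#include <cstdint>

int8_t mul_qs8_lane(int8_t a, int8_t b, int fixed_point_position)
{
    int32_t res = static_cast<int32_t>(a) * static_cast<int32_t>(b); // product carries twice the fractional bits
    res += 1 << (fixed_point_position - 1);                          // rounding constant
    return static_cast<int8_t>(res >> fixed_point_position);        // non-saturating narrow, like vmul_qs8
}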
+
+/** 8 bit fixed point vector long multiply (8 elements)
+ *
+ * @param[in] a                    First 8 bit fixed point input vector
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point long vector multiplication.
+ */
+qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate
+ */
+qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (8 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate (16 elements). This operation performs the product between @p b and @p c and adds the result to @p a (a + b * c).
+ *
+ * @param[in] a                    First 8 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate. The result is saturated in case of overflow
+ */
+qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position);
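For context (not part of the patch): the widening multiply-accumulate declared next is the natural building block for fixed point dot products, since the 16 bit accumulator preserves precision across iterations. A sketch with a hypothetical helper name, assuming both arrays hold at least 8 * n values:

arm_compute::qint16x8_t dot_qs8(const arm_compute::qint8_t *a, const arm_compute::qint8_t *b, int n, int fixed_point_position)
{
    using namespace arm_compute;
    qint16x8_t acc = vdupq_n_qs16(0);
    for(int i = 0; i < n; ++i)
    {
        acc = vmlal_qs8(acc, vld1_qs8(a + 8 * i), vld1_qs8(b + 8 * i), fixed_point_position);
    }
    return acc; // 8 partial sums in 16 bit fixed point
}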
+
+/** 8 bit fixed point vector multiply-accumulate long (8 elements).
+ * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** 8 bit fixed point vector saturating multiply-accumulate long (8 elements). The saturation is performed on the 16 bit fixed point output vector.
+ * This operation performs the product between @p b and @p c and adds the result to the 16 bit fixed point vector @p a (a + b * c).
+ *
+ * @param[in] a                    First 16 bit fixed point input vector where the result of multiplication must be added to
+ * @param[in] b                    Second 8 bit fixed point input vector
+ * @param[in] c                    Third 8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8 bit fixed point vector multiply-accumulate long
+ */
+qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position);
+
+/** Convert a float vector with 4x2 elements to 8 bit fixed point vector with 8 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position);
+
+/** Convert a float vector with 4x4 elements to 8 bit fixed point vector with 16 elements
+ *
+ * @param[in] a                    Float input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion float -> 8 bit fixed point
+ */
+qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point vector with 8 elements to a float vector with 4x2 elements
+ *
+ * @param[in] a                    8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x2
+ */
+float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Convert an 8 bit fixed point vector with 16 elements to a float vector with 4x4 elements
+ *
+ * @param[in] a                    8 bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the conversion 8 bit fixed point -> float32x4x4
+ */
+float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position);
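A usage sketch (not part of the patch), assuming a Q3.4 format (fixed_point_position = 4) where the raw value 16 encodes 1.0:

const arm_compute::qint8x8_t two  = arm_compute::vdup_n_qs8(32);     // 2.0 in Q3.4
const arm_compute::qint8x8_t half = arm_compute::vrecip_qs8(two, 4); // lanes ~ 8 (0.5), up to iteration error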
+
+/** Calculate reciprocal of a fixed point 8bit number using the Newton-Raphson method. (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit reciprocal (1/a).
+ */
+qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Division fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    First 8bit fixed point input vector
+ * @param[in] b                    Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8bit fixed point format.
+ */
+qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position);
+
+/** Division fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    First 8bit fixed point input vector
+ * @param[in] b                    Second 8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The quotient in 8bit fixed point format.
+ */
+qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Perform a 4th degree polynomial approximation. (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit Taylor approximation.
+ */
+template <bool islog>
+qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating exponential fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit saturating exponential
+ */
+qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate logarithm fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit logarithm.
+ */
+qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position);
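A usage sketch for the exponential (not part of the patch), again assuming Q3.4 where the raw value 16 encodes 1.0:

const arm_compute::qint8x8_t one = arm_compute::vdup_n_qs8(1 << 4); // 1.0 in Q3.4
const arm_compute::qint8x8_t e   = arm_compute::vqexp_qs8(one, 4);  // lanes ~ 43, since e ~ 2.72 and 2.72 * 16 ~ 43.5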
+
+/** Calculate inverse square root for fixed point 8bit using the Newton-Raphson method (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using the Newton-Raphson method (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate inverse square root for fixed point 8bit using the Newton-Raphson method (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating inverse square root for fixed point 8bit using the Newton-Raphson method (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit inverse sqrt.
+ */
+qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (8 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position);
+
+/** Calculate hyperbolic tangent for fixed point 8bit (16 elements)
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The calculated Hyperbolic Tangent.
+ */
+qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position);
+
+/** Calculate saturating n power for fixed point 8bit (16 elements).
+ *
+ * pow(a,b) = e^(b*log(a))
+ *
+ * @param[in] a                    8bit fixed point input vector
+ * @param[in] b                    8bit fixed point power vector
+ * @param[in] fixed_point_position Fixed point position that expresses the number of bits for the fractional part of the number
+ *
+ * @return The result of the 8bit power.
+ */
+qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position);
+}
+#include "arm_compute/core/NEON/NEFixedPoint.inl"
+#endif /* __ARM_COMPUTE_NEFIXEDPOINT_H__ */
diff --git a/arm_compute/core/NEON/NEFixedPoint.inl b/arm_compute/core/NEON/NEFixedPoint.inl
new file mode 100644
index 0000000000..6db344dc11
--- /dev/null
+++ b/arm_compute/core/NEON/NEFixedPoint.inl
@@ -0,0 +1,1018 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/**< Exponent polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements */
+const std::array<qint8x8_t, 4> exp_tab_qs8 =
+{
+    {
+        vdup_n_s8(0x7F), // 0.9978546
+        vdup_n_s8(0x3F), // 0.4994721
+        vdup_n_s8(0x16), // 0.1763723
+        vdup_n_s8(0x05), // 0.0435108
+    }
+};
+
+/**< Exponent polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements */
+const std::array<qint8x16_t, 4> exp_tabq_qs8 =
+{
+    {
+        vdupq_n_s8(0x7F), // 0.9978546
+        vdupq_n_s8(0x3F), // 0.4994721
+        vdupq_n_s8(0x16), // 0.1763723
+        vdupq_n_s8(0x05), // 0.0435108
+    }
+};
+
+/**< Logarithm polynomial coefficients for 8 bit fixed point (8 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x8_t, 4> log_tab_qs8 =
+{
+    {
+        vdup_n_s8(0x5C),  // 1.4384189
+        vdup_n_s8(-0x56), // -0.6771900
+        vdup_n_s8(0x29),  // 0.3218538
+        vdup_n_s8(-0x0A), // -0.0832229
+    }
+};
+
+/**< Logarithm polynomial coefficients for 8 bit fixed point (16 elements)
+ * Format is in Q0.7 for all elements except the first one which is in Q1.6 */
+const std::array<qint8x16_t, 4> log_tabq_qs8 =
+{
+    {
+        vdupq_n_s8(0x5C),  // 1.4384189
+        vdupq_n_s8(-0x56), // -0.6771900
+        vdupq_n_s8(0x29),  // 0.3218538
+        vdupq_n_s8(-0x0A), // -0.0832229
+    }
+};
+
+inline qint8x8_t vget_low_qs8(qint8x16_t a)
+{
+    return vget_low_s8(a);
+}
+
+inline qint8x8_t vget_high_qs8(qint8x16_t a)
+{
+    return vget_high_s8(a);
+}
+
+inline qint8x8_t vld1_qs8(const qint8_t *addr)
+{
+    return vld1_s8(addr);
+}
+
+inline qint8x16_t vld1q_qs8(const qint8_t *addr)
+{
+    return vld1q_s8(addr);
+}
+
+inline qint16x4_t vld1_qs16(const qint16_t *addr)
+{
+    return vld1_s16(addr);
+}
+
+inline qint16x8_t vld1q_qs16(const qint16_t *addr)
+{
+    return vld1q_s16(addr);
+}
+
+inline qint8x8_t vld1_dup_qs8(const qint8_t *addr)
+{
+    return vld1_dup_s8(addr);
+}
+
+inline qint8x16_t vld1q_dup_qs8(const qint8_t *addr)
+{
+    return vld1q_dup_s8(addr);
+}
+
+inline void vst1_qs8(qint8_t *addr, qint8x8_t b)
+{
+    vst1_s8(addr, b);
+}
+
+inline void vst1q_qs8(qint8_t *addr, qint8x16_t b)
+{
+    vst1q_s8(addr, b);
+}
+
+inline void vst1_qs16(qint16_t *addr, qint16x4_t b)
+{
+    vst1_s16(addr, b);
+}
+
+inline void vst1q_qs16(qint16_t *addr, qint16x8_t b)
+{
+    vst1q_s16(addr, b);
+}
+
+inline qint8x8_t vqmovn_qs16(qint16x8_t a)
+{
+    return vqmovn_s16(a);
+}
+
+inline qint8x8_t vdup_n_qs8(qint8_t a)
+{
+
return vdup_n_s8(a); +} + +inline qint8x16_t vdupq_n_qs8(qint8_t a) +{ + return vdupq_n_s8(a); +} + +inline qint8x16_t vdupq_n_qs8_f32(float a, int fixed_point_position) +{ + float32x4x4_t res = + { + { + vdupq_n_f32(a), + vdupq_n_f32(a), + vdupq_n_f32(a), + vdupq_n_f32(a), + } + }; + return vcvtq_qs8_f32(res, fixed_point_position); +} + +inline qint16x8_t vdupq_n_qs16(qint16_t a) +{ + return vdupq_n_s16(a); +} + +inline qint8x8_t vabs_qs8(qint8x8_t a) +{ + return vabs_s8(a); +} + +inline qint8x16_t vabsq_qs8(qint8x16_t a) +{ + return vabsq_s8(a); +} + +inline qint8x8_t vqabs_qs8(qint8x8_t a) +{ + return vqabs_s8(a); +} + +inline qint8x16_t vqabsq_qs8(qint8x16_t a) +{ + return vqabsq_s8(a); +} + +inline qint8x8_t vmax_qs8(qint8x8_t a, qint8x8_t b) +{ + return vmax_s8(a, b); +} + +inline qint8x16_t vmaxq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vmaxq_s8(a, b); +} + +inline qint8x8_t vpmax_qs8(qint8x8_t a, qint8x8_t b) +{ + return vpmax_s8(a, b); +} + +inline qint8x8_t vmin_qs8(qint8x8_t a, qint8x8_t b) +{ + return vmin_s8(a, b); +} + +inline qint8x16_t vminq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vminq_s8(a, b); +} + +inline qint8x8_t vpmin_qs8(qint8x8_t a, qint8x8_t b) +{ + return vpmin_s8(a, b); +} + +inline qint8x8_t vadd_qs8(qint8x8_t a, qint8x8_t b) +{ + return vadd_s8(a, b); +} + +inline qint8x16_t vaddq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vaddq_s8(a, b); +} + +inline qint8x8_t vqadd_qs8(qint8x8_t a, qint8x8_t b) +{ + return vqadd_s8(a, b); +} + +inline qint8x16_t vqaddq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vqaddq_s8(a, b); +} + +inline qint16x4_t vqadd_qs16(qint16x4_t a, qint16x4_t b) +{ + return vqadd_s16(a, b); +} + +inline qint16x8_t vqaddq_qs16(qint16x8_t a, qint16x8_t b) +{ + return vqaddq_s16(a, b); +} + +inline int16x4_t vpaddl_qs8(qint8x8_t a) +{ + return vpaddl_s8(a); +} + +inline qint8x8_t vsub_qs8(qint8x8_t a, qint8x8_t b) +{ + return vsub_s8(a, b); +} + +inline qint8x16_t vsubq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vsubq_s8(a, b); +} + +inline qint8x8_t vqsub_qs8(qint8x8_t a, qint8x8_t b) +{ + return vqsub_s8(a, b); +} + +inline qint8x16_t vqsubq_qs8(qint8x16_t a, qint8x16_t b) +{ + return vqsubq_s8(a, b); +} + +inline qint8x8_t vmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary result with a constant used to round up the result + qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + res = vmlal_s8(res, a, b); + + // Shift right by fixed_point_position + res = vshlq_s16(res, fixed_point_position_s16); + + // Convert back to qint8 + return vmovn_s16(res); +} + +inline qint8x16_t vmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t res1 = res0; + + // Vector multiply-accumulate long + res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b)); + res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b)); + + // Shift right by fixed_point_position + res0 = vshlq_s16(res0, fixed_point_position_s16); + res1 = vshlq_s16(res1, fixed_point_position_s16); + + // Convert back to qint8 + return vcombine_s8(vmovn_s16(res0), vmovn_s16(res1)); +} + +inline qint8x8_t vqmul_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + const 
int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary result with a constant used to round up the result + qint16x8_t res = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + res = vmlal_s8(res, a, b); + + // Shift right by fixed_point_position + res = vqshlq_s16(res, fixed_point_position_s16); + + // Convert back to qint8 and saturate + return vqmovn_s16(res); +} + +inline qint8x16_t vqmulq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t res0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t res1 = res0; + + // Vector multiply-accumulate long + res0 = vmlal_s8(res0, vget_low_s8(a), vget_low_s8(b)); + res1 = vmlal_s8(res1, vget_high_s8(a), vget_high_s8(b)); + + // Shift right by fixed_point_position + res0 = vqshlq_s16(res0, fixed_point_position_s16); + res1 = vqshlq_s16(res1, fixed_point_position_s16); + + // Convert back to qint8 and saturate + return vcombine_s8(vqmovn_s16(res0), vqmovn_s16(res1)); +} + +inline qint16x8_t vmull_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + qint16x8_t res = vmull_s8(a, b); + + return vqrshlq_s16(res, fixed_point_position_s16); +} + +inline qint8x8_t vmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmlal_s8(tmp, b, c); + + // Shift right by fixed_point_position + tmp = vshlq_s16(tmp, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + return vadd_s8(a, vmovn_s16(tmp)); +} + +inline qint8x16_t vmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1)); + qint16x8_t tmp1 = tmp0; + + // Vector multiply-accumulate long + tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c)); + tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c)); + + // Shift right by fixed_point_position + tmp0 = vshlq_s16(tmp0, fixed_point_position_s16); + tmp1 = vshlq_s16(tmp1, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + return vcombine_s8(vadd_s8(vget_low_s8(a), vmovn_s16(tmp0)), vadd_s8(vget_high_s8(a), vmovn_s16(tmp1))); +} + +inline qint8x8_t vqmla_qs8(qint8x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position) +{ + const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position); + + // Initialize the temporary results with a constant used to round up the result + qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1)); + + // Vector multiply-accumulate long + tmp = vmlal_s8(tmp, b, c); + + // Shift right by fixed_point_position + tmp = vqshlq_s16(tmp, fixed_point_position_s16); + + // Convert back to qint8 and accumulate + return vqadd_s8(a, vqmovn_s16(tmp)); +} + +inline qint8x16_t vqmlaq_qs8(qint8x16_t a, qint8x16_t b, qint8x16_t c, int fixed_point_position) +{ + const int16x8_t 
fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp0 = vdupq_n_s16(1 << (fixed_point_position - 1));
+    qint16x8_t tmp1 = tmp0;
+
+    // Vector multiply-accumulate long
+    tmp0 = vmlal_s8(tmp0, vget_low_s8(b), vget_low_s8(c));
+    tmp1 = vmlal_s8(tmp1, vget_high_s8(b), vget_high_s8(c));
+
+    // Shift right by fixed_point_position
+    tmp0 = vqshlq_s16(tmp0, fixed_point_position_s16);
+    tmp1 = vqshlq_s16(tmp1, fixed_point_position_s16);
+
+    // Convert back to qint8 and accumulate
+    qint8x16_t res = vcombine_s8(vqmovn_s16(tmp0), vqmovn_s16(tmp1));
+    return vqaddq_s8(a, res);
+}
+
+inline qint16x8_t vmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vshlq_s16(tmp, fixed_point_position_s16);
+
+    // Accumulate
+    return vaddq_s16(a, tmp);
+}
+
+inline qint16x8_t vqmlal_qs8(qint16x8_t a, qint8x8_t b, qint8x8_t c, int fixed_point_position)
+{
+    const int16x8_t fixed_point_position_s16 = vdupq_n_s16(-fixed_point_position);
+
+    // Initialize the temporary results with a constant used to round up the result
+    qint16x8_t tmp = vdupq_n_s16(1 << (fixed_point_position - 1));
+
+    // Vector multiply-accumulate long
+    tmp = vmlal_s8(tmp, b, c);
+
+    // Shift right by fixed_point_position
+    tmp = vqshlq_s16(tmp, fixed_point_position_s16);
+
+    // Accumulate
+    return vqaddq_s16(a, tmp);
+}
+
+inline qint8x8_t vcvt_qs8_f32(const float32x4x2_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1]),
+        }
+    };
+
+    const int16x8_t res_s16 = vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1]));
+
+    return vqmovn_s16(res_s16);
+}
+
+inline qint8x16_t vcvtq_qs8_f32(const float32x4x4_t &a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(static_cast<float>(1 << fixed_point_position));
+
+    float32x4x4_t res_f32 =
+    {
+        {
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f),
+            vdupq_n_f32(0.5f)
+        }
+    };
+
+    res_f32.val[0] = vmlaq_f32(res_f32.val[0], a.val[0], pow2);
+    res_f32.val[1] = vmlaq_f32(res_f32.val[1], a.val[1], pow2);
+    res_f32.val[2] = vmlaq_f32(res_f32.val[2], a.val[2], pow2);
+    res_f32.val[3] = vmlaq_f32(res_f32.val[3], a.val[3], pow2);
+
+    const int32x4x4_t res_s32 =
+    {
+        {
+            vcvtq_s32_f32(res_f32.val[0]),
+            vcvtq_s32_f32(res_f32.val[1]),
+            vcvtq_s32_f32(res_f32.val[2]),
+            vcvtq_s32_f32(res_f32.val[3]),
+        }
+    };
+
+    const int16x8x2_t res_s16 =
+    {
+        {
+            vcombine_s16(vqmovn_s32(res_s32.val[0]), vqmovn_s32(res_s32.val[1])),
+            vcombine_s16(vqmovn_s32(res_s32.val[2]), vqmovn_s32(res_s32.val[3])),
+        }
+    };
+
+    return vcombine_s8(vqmovn_s16(res_s16.val[0]), vqmovn_s16(res_s16.val[1]));
+}
+
+inline float32x4x2_t vcvt_f32_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
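+    // Note: 'pow2' holds 2^-fixed_point_position; the widened integer lanes below are multiplied by it to recover the real values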
+
+    const int16x8_t res_s16 = vmovl_s8(a);
+
+    const int32x4x2_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_s16(res_s16)),
+            vmovl_s16(vget_high_s16(res_s16))
+        }
+    };
+
+    float32x4x2_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+
+    return res_f32;
+}
+
+inline float32x4x4_t vcvtq_f32_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const float32x4_t pow2 = vdupq_n_f32(1.0f / (1 << fixed_point_position));
+
+    const int16x8x2_t res_s16 =
+    {
+        {
+            vmovl_s8(vget_low_s8(a)),
+            vmovl_s8(vget_high_s8(a)),
+        }
+    };
+
+    const int32x4x4_t res_s32 =
+    {
+        {
+            vmovl_s16(vget_low_s16(res_s16.val[0])),
+            vmovl_s16(vget_high_s16(res_s16.val[0])),
+            vmovl_s16(vget_low_s16(res_s16.val[1])),
+            vmovl_s16(vget_high_s16(res_s16.val[1])),
+        }
+    };
+
+    float32x4x4_t res_f32 =
+    {
+        {
+            vcvtq_f32_s32(res_s32.val[0]),
+            vcvtq_f32_s32(res_s32.val[1]),
+            vcvtq_f32_s32(res_s32.val[2]),
+            vcvtq_f32_s32(res_s32.val[3])
+        }
+    };
+
+    res_f32.val[0] = vmulq_f32(res_f32.val[0], pow2);
+    res_f32.val[1] = vmulq_f32(res_f32.val[1], pow2);
+    res_f32.val[2] = vmulq_f32(res_f32.val[2], pow2);
+    res_f32.val[3] = vmulq_f32(res_f32.val[3], pow2);
+
+    return res_f32;
+}
+
+inline qint8x8_t vrecip_qs8(qint8x8_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x8_t const_48_over_17       = vdup_n_s8(0x7A >> (5 - fixed_point_position));    // 2.823
+    const qint8x8_t const_minus_32_over_17 = vdup_n_s8(-(0x3C >> (5 - fixed_point_position))); // -1.8823
+    const qint8x8_t const_one              = vdup_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+    const qint8x8_t temp        = vshl_s8(a, shift_value);
+
+    qint8x8_t x = vadd_s8(const_48_over_17, vmul_qs8(temp, const_minus_32_over_17, fixed_point_position));
+
+    uint8x8_t set_one = vcgt_s8(x, const_one);
+    x                 = vbsl_s8(set_one, const_one, x);
+
+    // Use three iterations of Newton-Raphson method to get the result
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vadd_s8(x, vmul_qs8(x, vsub_s8(const_one, vmul_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshl_s8(x, shift_value);
+}
+
+inline qint8x16_t vrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
+    const qint8x16_t const_32_over_17 = vdupq_n_s8(0x3C >> (5 - fixed_point_position)); // 1.8823, subtracted below
+    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    const qint8x16_t temp        = vshlq_s8(a, shift_value);
+
+    qint8x16_t x = vsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint8x16_t set_one = vcgtq_s8(x, const_one);
+    x                  = vbslq_s8(set_one, const_one, x);
+
+    // Use three iterations of Newton-Raphson method to get the result
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vaddq_s8(x, vmulq_qs8(x, vsubq_s8(const_one, vmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vshlq_s8(x, shift_value);
+}
+
+inline qint8x16_t vqrecipq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    // We need two bits to store 2, thus we can only support formats from Q2.5 to Q7.0
+    const qint8x16_t const_48_over_17 = vdupq_n_s8(0x7A >> (5 - fixed_point_position)); // 2.823
+    const qint8x16_t const_32_over_17 = vdupq_n_s8(0x3C >> (5 - fixed_point_position)); // 1.8823, subtracted below
+    const qint8x16_t const_one        = vdupq_n_s8(1 << fixed_point_position);
+
+    // Find shift value
+    const qint8x16_t shift_value = vqnegq_s8(vqsubq_s8(vdupq_n_s8(8), vqaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))));
+    const qint8x16_t temp        = vqshlq_s8(a, shift_value);
+
+    qint8x16_t x = vqsubq_qs8(const_48_over_17, vmulq_qs8(temp, const_32_over_17, fixed_point_position));
+
+    // Set initial guess to one if x > 1
+    uint8x16_t set_one = vcgtq_s8(x, const_one);
+    x                  = vbslq_s8(set_one, const_one, x);
+
+    // Use three iterations of Newton-Raphson method to get the result
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+    x = vqaddq_s8(x, vqmulq_qs8(x, vqsubq_s8(const_one, vqmulq_qs8(temp, x, fixed_point_position)), fixed_point_position));
+
+    return vqshlq_s8(x, shift_value);
+}
+
+inline qint8x8_t vdiv_qs8(qint8x8_t a, qint8x8_t b, int fixed_point_position)
+{
+    return vmul_qs8(a, vrecip_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+inline qint8x16_t vdivq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position)
+{
+    return vmulq_qs8(a, vrecipq_qs8(b, fixed_point_position), fixed_point_position);
+}
+
+template <bool islog>
+inline qint8x8_t vtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+    const qint8x8_t const_one   = vdup_n_s8(1);
+    const qint8x8_t A           = vrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vadd_s8(shift_value, const_one) : shift_value);
+    const qint8x8_t B           = vrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+    const qint8x8_t C           = vrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+    const qint8x8_t D           = vrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+    const qint8x8_t x1          = vadd_s8(vmul_qs8(a, D, fixed_point_position), C);
+    const qint8x8_t x2          = vadd_s8(vmul_qs8(a, x1, fixed_point_position), B);
+    const qint8x8_t x3          = vadd_s8(vmul_qs8(a, x2, fixed_point_position), A);
+    const qint8x8_t res         = vmul_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool islog>
+inline qint8x8_t vqtaylor_poly_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value = vdup_n_s8(-(7 - fixed_point_position));
+    const qint8x8_t const_one   = vdup_n_s8(1);
+    const qint8x8_t A           = vqrshl_s8(islog ? log_tab_qs8[0] : exp_tab_qs8[0], islog ? vqadd_s8(shift_value, const_one) : shift_value);
+    const qint8x8_t B           = vqrshl_s8(islog ? log_tab_qs8[1] : exp_tab_qs8[1], shift_value);
+    const qint8x8_t C           = vqrshl_s8(islog ? log_tab_qs8[2] : exp_tab_qs8[2], shift_value);
+    const qint8x8_t D           = vqrshl_s8(islog ? log_tab_qs8[3] : exp_tab_qs8[3], shift_value);
+    const qint8x8_t x1          = vqadd_s8(vqmul_qs8(a, D, fixed_point_position), C);
+    const qint8x8_t x2          = vqadd_s8(vqmul_qs8(a, x1, fixed_point_position), B);
+    const qint8x8_t x3          = vqadd_s8(vqmul_qs8(a, x2, fixed_point_position), A);
+    const qint8x8_t res         = vqmul_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool islog>
+inline qint8x16_t vtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+    const qint8x16_t const_one   = vdupq_n_s8(1);
+    const qint8x16_t A           = vrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vaddq_s8(shift_value, const_one) : shift_value);
+    const qint8x16_t B           = vrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+    const qint8x16_t C           = vrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+    const qint8x16_t D           = vrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+    const qint8x16_t x1          = vaddq_s8(vmulq_qs8(a, D, fixed_point_position), C);
+    const qint8x16_t x2          = vaddq_s8(vmulq_qs8(a, x1, fixed_point_position), B);
+    const qint8x16_t x3          = vaddq_s8(vmulq_qs8(a, x2, fixed_point_position), A);
+    const qint8x16_t res         = vmulq_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+template <bool islog>
+inline qint8x16_t vqtaylor_polyq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value = vdupq_n_s8(-(7 - fixed_point_position));
+    const qint8x16_t const_one   = vdupq_n_s8(1);
+    const qint8x16_t A           = vqrshlq_s8(islog ? log_tabq_qs8[0] : exp_tabq_qs8[0], islog ? vqaddq_s8(shift_value, const_one) : shift_value);
+    const qint8x16_t B           = vqrshlq_s8(islog ? log_tabq_qs8[1] : exp_tabq_qs8[1], shift_value);
+    const qint8x16_t C           = vqrshlq_s8(islog ? log_tabq_qs8[2] : exp_tabq_qs8[2], shift_value);
+    const qint8x16_t D           = vqrshlq_s8(islog ? log_tabq_qs8[3] : exp_tabq_qs8[3], shift_value);
+    const qint8x16_t x1          = vqaddq_s8(vqmulq_qs8(a, D, fixed_point_position), C);
+    const qint8x16_t x2          = vqaddq_s8(vqmulq_qs8(a, x1, fixed_point_position), B);
+    const qint8x16_t x3          = vqaddq_s8(vqmulq_qs8(a, x2, fixed_point_position), A);
+    const qint8x16_t res         = vqmulq_qs8(a, x3, fixed_point_position);
+    return res;
+}
+
+inline qint8x8_t vqexp_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t shift_value   = vdup_n_s8(fixed_point_position - 7);
+    const qint8x8_t const_one     = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_ln2     = vqrshl_s8(vdup_n_s8(0x58), shift_value);                   // ln(2)
+    const qint8x8_t const_inv_ln2 = vorr_s8(vqrshl_s8(vdup_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint8x8_t m = vqmul_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // get decimal part from m
+    const qint8x8_t dec_m = vqshl_s8(m, vdup_n_s8(-fixed_point_position));
+
+    qint8x8_t alpha = vqmul_qs8(vqshl_s8(dec_m, vdup_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha           = vqabs_qs8(vqsub_s8(a, alpha));
+
+    // Polynomial Approximation
+    qint8x8_t poly = vqtaylor_poly_qs8<false>(alpha, fixed_point_position);
+    poly           = vqadd_s8(poly, const_one);
+
+    // Reconstruct
+    poly = vqshl_s8(poly, dec_m);
+
+    return poly;
+}
+
+inline qint8x16_t vqexpq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t shift_value   = vdupq_n_s8(fixed_point_position - 7);
+    const qint8x16_t const_one     = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_ln2     = vqrshlq_s8(vdupq_n_s8(0x58), shift_value);                    // ln(2)
+    const qint8x16_t const_inv_ln2 = vorrq_s8(vqrshlq_s8(vdupq_n_s8(0x38), shift_value), const_one); // 1/ln(2)
+
+    // Perform range reduction [-log(2),log(2)]
+    const qint8x16_t m = vqmulq_qs8(a, const_inv_ln2, fixed_point_position); // x / ln(2)
+
+    // get decimal part from m
+    const qint8x16_t dec_m = vqshlq_s8(m, vdupq_n_s8(-fixed_point_position));
+
+    qint8x16_t alpha = vqmulq_qs8(vqshlq_s8(dec_m, vdupq_n_s8(fixed_point_position)), const_ln2, fixed_point_position);
+    alpha            = vqabsq_qs8(vqsubq_qs8(a, alpha));
+
+    // Polynomial Approximation
+    qint8x16_t poly = vqtaylor_polyq_qs8<false>(alpha, fixed_point_position);
+    poly            = vqaddq_s8(poly, const_one);
+
+    // Reconstruct
+    poly = vqshlq_s8(poly, dec_m);
+
+    return poly;
+}
+
+inline qint8x8_t vlog_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_one       = vdup_n_s8(1 << fixed_point_position);
+    const qint8x8_t const_seven_dec = vdup_n_s8(7);
+    const qint8x8_t const_ln2       = vdup_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/x)
+    uint8x8_t calc_reciprocal = vclt_s8(a, const_one);
+    qint8x8_t recip           = vdup_n_s8(0);
+    recip                     = vbsl_s8(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vrecip_qs8(recip, fixed_point_position);
+    a     = vbsl_s8(calc_reciprocal, recip, a);
+
+    // Get decimal part of a
+    qint8x8_t shift_value = vdup_n_s8(-fixed_point_position);
+    qint8x8_t dec_a       = vshl_s8(a, shift_value); // a >> fixed_point_position
+
+    // Get exponent of 2^n which is equal or less than dec_a
+    shift_value = vsub_s8(const_seven_dec, vclz_s8(dec_a));
+
+    // Get x to range (1, 2]
+    const qint8x8_t shift_value_neg = vneg_s8(shift_value);
+    const qint8x8_t temp            = vsub_s8(vrshl_s8(a, shift_value_neg), const_one);
+    const qint8x8_t sum             = vmul_s8(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint8x8_t poly = vtaylor_poly_qs8<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmul_qs8(vadd_s8(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbsl_s8(calc_reciprocal, vneg_s8(poly), poly);
+
+    return poly;
+}
+
+inline qint8x16_t vlogq_qs8(qint8x16_t a, int fixed_point_position)
+{
+    const qint8x16_t const_one       = vdupq_n_s8(1 << fixed_point_position);
+    const qint8x16_t const_seven_dec = vdupq_n_s8(7);
+    const qint8x16_t const_ln2       = vdupq_n_s8(0x58 >> (7 - fixed_point_position)); // ln(2)
+
+    // If 0 < a < 1, calculate log(1/x)
+    uint8x16_t calc_reciprocal = vcltq_s8(a, const_one);
+    qint8x16_t recip           = vdupq_n_s8(0);
+    recip                      = vbslq_s8(calc_reciprocal, a, recip);
+
+    // Calculate reciprocal
+    recip = vrecipq_qs8(recip, fixed_point_position);
+    a     = vbslq_s8(calc_reciprocal, recip, a);
+
+    // Get decimal part of a
+    qint8x16_t shift_value = vdupq_n_s8(-fixed_point_position);
+    qint8x16_t dec_a       = vshlq_s8(a, shift_value); // a >> fixed_point_position
+
+    // Get exponent of 2^n which is equal or less than dec_a
+    shift_value = vsubq_s8(const_seven_dec, vclzq_s8(dec_a));
+
+    // Get x to range (1, 2]
+    const qint8x16_t shift_value_neg = vnegq_s8(shift_value);
+    const qint8x16_t temp            = vsubq_s8(vrshlq_s8(a, shift_value_neg), const_one);
+    const qint8x16_t sum             = vmulq_s8(shift_value, const_one);
+
+    // Polynomial Approximation
+    qint8x16_t poly = vtaylor_polyq_qs8<true>(temp, fixed_point_position);
+
+    // Reconstruct
+    poly = vmulq_qs8(vaddq_s8(poly, sum), const_ln2, fixed_point_position);
+
+    // Set negative value for 0 < a < 1
+    poly = vbslq_s8(calc_reciprocal, vnegq_s8(poly), poly);
+
+    return poly;
+}
+
+inline qint8x8_t vinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
+    qint8x8_t shift_value = vneg_s8(vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))));
+
+    // Add one when the shift value is negative in order to get the correct result when we shift right with 1
+    qint8x8_t temp     = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)));
+    uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0));
+    temp               = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp);
+    qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1));
+
+    temp = vshl_s8(a, shift_value);
+
+    // Initial guess
+    qint8x8_t x = temp;
+
+    // Calculate (x / 2) * (3 - a * x^2)
+    // After three iterations we have the result for 8 bit
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+    x = vshr_n_s8(vmul_qs8(x, vsub_s8(const_three, vmul_qs8(temp, vmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1);
+
+    return vshl_s8(x, shift_value2);
+}
+
+inline qint8x8_t vqinvsqrt_qs8(qint8x8_t a, int fixed_point_position)
+{
+    const qint8x8_t const_three = vdup_n_s8(3 << fixed_point_position);
+
+    // Find shift value. Number must be in (0.5, 2) range.
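+    // Note: since 1/sqrt(a * 2^s) = 2^(s/2) / sqrt(a), only half of the normalisation shift is undone when the result is scaled back (via shift_value2)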
+ qint8x8_t shift_value = vneg_s8(vqsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x8_t temp = vsub_s8(vdup_n_s8(8), vadd_s8(vclz_s8(a), vdup_n_s8(fixed_point_position))); + uint8x8_t temp_ltz = vclt_s8(temp, vdup_n_qs8(0)); + temp = vbsl_s8(temp_ltz, vadd_s8(temp, vdup_n_s8(1)), temp); + qint8x8_t shift_value2 = vneg_s8(vshr_n_s8(temp, 1)); + + temp = vshl_s8(a, shift_value); + + // Initial guess + qint8x8_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshr_n_s8(vqmul_qs8(x, vqsub_s8(const_three, vqmul_qs8(temp, vqmul_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshl_s8(x, shift_value2); +} + +inline qint8x16_t vinvsqrtq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); + + // Find shift value. Number must be in (0.5, 2) range. + qint8x16_t shift_value = vnegq_s8(vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))); + uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0)); + temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp); + qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); + + temp = vshlq_s8(a, shift_value); + + // Initial guess + qint8x16_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vmulq_qs8(x, vsubq_s8(const_three, vmulq_qs8(temp, vmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshlq_s8(x, shift_value2); +} + +inline qint8x16_t vqinvsqrtq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_three = vdupq_n_s8(3 << fixed_point_position); + + // Find shift value. Number must be in (0.5, 2) range. 
+ qint8x16_t shift_value = vnegq_s8(vqsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position)))); + + // Add one when the shift value is negative in order to get the correct result when we shift right with 1 + qint8x16_t temp = vsubq_s8(vdupq_n_s8(8), vaddq_s8(vclzq_s8(a), vdupq_n_s8(fixed_point_position))); + uint8x16_t temp_ltz = vcltq_s8(temp, vdupq_n_qs8(0)); + temp = vbslq_s8(temp_ltz, vaddq_s8(temp, vdupq_n_s8(1)), temp); + qint8x16_t shift_value2 = vnegq_s8(vshrq_n_s8(temp, 1)); + + temp = vshlq_s8(a, shift_value); + + // Initial guess + qint8x16_t x = temp; + + // Calculate (x / 2) * (3 - a * x^2) + // After three iterations we have the result for 8 bit + x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + x = vshrq_n_s8(vqmulq_qs8(x, vqsubq_s8(const_three, vqmulq_qs8(temp, vqmulq_qs8(x, x, fixed_point_position), fixed_point_position)), fixed_point_position), 1); + + return vshlq_s8(x, shift_value2); +} + +inline qint8x8_t vtanh_qs8(qint8x8_t a, int fixed_point_position) +{ + const qint8x8_t const_one = vdup_n_s8(1 << fixed_point_position); + const qint8x8_t const_two = vdup_n_s8(2 << fixed_point_position); + + qint8x8_t exp2x = vqexp_qs8(vqmul_qs8(const_two, a, fixed_point_position), fixed_point_position); + qint8x8_t num = vqsub_qs8(exp2x, const_one); + qint8x8_t den = vqadd_qs8(exp2x, const_one); + qint8x8_t tanh = vqmul_qs8(num, vrecip_qs8(den, fixed_point_position), fixed_point_position); + + return tanh; +} + +inline qint8x16_t vtanhq_qs8(qint8x16_t a, int fixed_point_position) +{ + const qint8x16_t const_one = vdupq_n_s8(1 << fixed_point_position); + const qint8x16_t const_two = vdupq_n_s8(2 << fixed_point_position); + + qint8x16_t exp2x = vqexpq_qs8(vqmulq_qs8(const_two, a, fixed_point_position), fixed_point_position); + qint8x16_t num = vqsubq_qs8(exp2x, const_one); + qint8x16_t den = vqaddq_qs8(exp2x, const_one); + qint8x16_t tanh = vqmulq_qs8(num, vqrecipq_qs8(den, fixed_point_position), fixed_point_position); + + return tanh; +} + +inline qint8x16_t vqpowq_qs8(qint8x16_t a, qint8x16_t b, int fixed_point_position) +{ + return vqexpq_qs8(vqmulq_qs8(b, vlogq_qs8(a, fixed_point_position), fixed_point_position), fixed_point_position); +} +} diff --git a/arm_compute/core/NEON/NEKernels.h b/arm_compute/core/NEON/NEKernels.h new file mode 100644 index 0000000000..eaa50f123b --- /dev/null +++ b/arm_compute/core/NEON/NEKernels.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEKERNELS_H__ +#define __ARM_COMPUTE_NEKERNELS_H__ + +/* Header regrouping all the NEON kernels */ +#include "arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h" +#include "arm_compute/core/NEON/kernels/NEAccumulateKernel.h" +#include "arm_compute/core/NEON/kernels/NEActivationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h" +#include "arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h" +#include "arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h" +#include "arm_compute/core/NEON/kernels/NEBox3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NECannyEdgeKernel.h" +#include "arm_compute/core/NEON/kernels/NEChannelCombineKernel.h" +#include "arm_compute/core/NEON/kernels/NEChannelExtractKernel.h" +#include "arm_compute/core/NEON/kernels/NECol2ImKernel.h" +#include "arm_compute/core/NEON/kernels/NEColorConvertKernel.h" +#include "arm_compute/core/NEON/kernels/NEConvolutionKernel.h" +#include "arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDepthConvertKernel.h" +#include "arm_compute/core/NEON/kernels/NEDerivativeKernel.h" +#include "arm_compute/core/NEON/kernels/NEDilateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h" +#include "arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEErodeKernel.h" +#include "arm_compute/core/NEON/kernels/NEFastCornersKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillArrayKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h" +#include "arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h" +#include "arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h" +#include "arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h" +#include "arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h" +#include "arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h" +#include "arm_compute/core/NEON/kernels/NEHistogramKernel.h" +#include "arm_compute/core/NEON/kernels/NEIm2ColKernel.h" +#include 
"arm_compute/core/NEON/kernels/NEIntegralImageKernel.h" +#include "arm_compute/core/NEON/kernels/NELKTrackerKernel.h" +#include "arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h" +#include "arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h" +#include "arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h" +#include "arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h" +#include "arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h" +#include "arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NERemapKernel.h" +#include "arm_compute/core/NEON/kernels/NEScaleKernel.h" +#include "arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NESobel3x3Kernel.h" +#include "arm_compute/core/NEON/kernels/NESobel5x5Kernel.h" +#include "arm_compute/core/NEON/kernels/NESobel7x7Kernel.h" +#include "arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h" +#include "arm_compute/core/NEON/kernels/NETableLookupKernel.h" +#include "arm_compute/core/NEON/kernels/NEThresholdKernel.h" +#include "arm_compute/core/NEON/kernels/NETransposeKernel.h" +#include "arm_compute/core/NEON/kernels/NEWarpKernel.h" +#include "arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h" + +#endif /* __ARM_COMPUTE_NEKERNELS_H__ */ diff --git a/arm_compute/core/NEON/NEMath.h b/arm_compute/core/NEON/NEMath.h new file mode 100644 index 0000000000..bb8a330c1e --- /dev/null +++ b/arm_compute/core/NEON/NEMath.h @@ -0,0 +1,96 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEMATH_H__ +#define __ARM_COMPUTE_NEMATH_H__ + +#include + +namespace arm_compute +{ +/** Calculate inverse square root. + * + * @param[in] x Input value. + * + * @return The calculated inverse square root. + */ +float32x4_t vinvsqrtq_f32(float32x4_t x); + +/** Calculate reciprocal. + * + * @param[in] x Input value. + * + * @return The calculated reciprocal. + */ +float32x4_t vinvq_f32(float32x4_t x); + +/** Perform a 7th degree polynomial approximation using Estrin's method. + * + * @param[in] x Input vector value in F32 format. 
+ * @param[in] coeffs Polynomial coefficients table.
+ *
+ * @return The calculated approximation.
+ */
+float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs);
+
+/** Calculate exponential.
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated exponential.
+ */
+float32x4_t vexpq_f32(float32x4_t x);
+
+/** Calculate logarithm.
+ *
+ * @param[in] x Input vector value in F32 format.
+ *
+ * @return The calculated logarithm.
+ */
+float32x4_t vlogq_f32(float32x4_t x);
+
+/** Calculate hyperbolic tangent.
+ *
+ * tanh(x) = (e^2x - 1)/(e^2x + 1)
+ *
+ * @note We clamp x to [-10,10] to avoid overflow issues.
+ *
+ * @param[in] val Input vector value in F32 format.
+ *
+ * @return The calculated hyperbolic tangent.
+ */
+float32x4_t vtanhq_f32(float32x4_t val);
+
+/** Calculate the n-th power of a number.
+ *
+ * pow(x,n) = e^(n*log(x))
+ *
+ * @param[in] val Input vector value in F32 format.
+ * @param[in] n   Power to raise the input to.
+ *
+ * @return The calculated power.
+ */
+float32x4_t vpowq_f32(float32x4_t val, float32x4_t n);
+}
+#include "arm_compute/core/NEON/NEMath.inl"
+#endif /* __ARM_COMPUTE_NEMATH_H__ */
diff --git a/arm_compute/core/NEON/NEMath.inl b/arm_compute/core/NEON/NEMath.inl
new file mode 100644
index 0000000000..a31a4c0dc5
--- /dev/null
+++ b/arm_compute/core/NEON/NEMath.inl
@@ -0,0 +1,141 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+namespace arm_compute
+{
+/* Exponent polynomial coefficients (entry i holds the coefficient of x^p with p = {0,4,2,6,1,5,3,7}[i], matching the pairing in vtaylor_polyq_f32) */
+const std::array<float32x4_t, 8> exp_tab =
+{
+    {
+        vdupq_n_f32(1.f),
+        vdupq_n_f32(0.0416598916054f),
+        vdupq_n_f32(0.500000596046f),
+        vdupq_n_f32(0.0014122662833f),
+        vdupq_n_f32(1.00000011921f),
+        vdupq_n_f32(0.00833693705499f),
+        vdupq_n_f32(0.166665703058f),
+        vdupq_n_f32(0.000195780929062f),
+    }
+};
+
+/* Logarithm polynomial coefficients (same interleaved ordering as exp_tab) */
+const std::array<float32x4_t, 8> log_tab =
+{
+    {
+        vdupq_n_f32(-2.29561495781f),
+        vdupq_n_f32(-2.47071170807f),
+        vdupq_n_f32(-5.68692588806f),
+        vdupq_n_f32(-0.165253549814f),
+        vdupq_n_f32(5.17591238022f),
+        vdupq_n_f32(0.844007015228f),
+        vdupq_n_f32(4.58445882797f),
+        vdupq_n_f32(0.0141278216615f),
+    }
+};
+
+inline float32x4_t vinvsqrtq_f32(float32x4_t x)
+{
+    // Initial estimate followed by two Newton-Raphson refinement steps
+    float32x4_t sqrt_reciprocal = vrsqrteq_f32(x);
+    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+    sqrt_reciprocal = vmulq_f32(vrsqrtsq_f32(vmulq_f32(x, sqrt_reciprocal), sqrt_reciprocal), sqrt_reciprocal);
+
+    return sqrt_reciprocal;
+}
+
+inline float32x4_t vinvq_f32(float32x4_t x)
+{
+    // Initial estimate followed by two Newton-Raphson refinement steps
+    float32x4_t recip = vrecpeq_f32(x);
+    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    recip = vmulq_f32(vrecpsq_f32(x, recip), recip);
+    return recip;
+}
+
+inline float32x4_t vtaylor_polyq_f32(float32x4_t x, const std::array<float32x4_t, 8> &coeffs)
+{
+    // Estrin-style evaluation: pair the coefficients, then combine with x^2 and x^4
+    float32x4_t A   = vmlaq_f32(coeffs[0], coeffs[4], x);
+    float32x4_t B   = vmlaq_f32(coeffs[2], coeffs[6], x);
+    float32x4_t C   = vmlaq_f32(coeffs[1], coeffs[5], x);
+    float32x4_t D   = vmlaq_f32(coeffs[3], coeffs[7], x);
+    float32x4_t x2  = vmulq_f32(x, x);
+    float32x4_t x4  = vmulq_f32(x2, x2);
+    float32x4_t res = vmlaq_f32(vmlaq_f32(A, B, x2), vmlaq_f32(C, D, x2), x4);
+    return res;
+}
+
+inline float32x4_t vexpq_f32(float32x4_t x)
+{
+    static const float32x4_t CONST_LN2     = vdupq_n_f32(0.6931471805f); // ln(2)
+    static const float32x4_t CONST_INV_LN2 = vdupq_n_f32(1.4426950408f); // 1/ln(2)
+
+    // Perform range reduction to [-log(2),log(2)]: x = m*ln(2) + val
+    int32x4_t   m   = vcvtq_s32_f32(vmulq_f32(x, CONST_INV_LN2));
+    float32x4_t val = vmlsq_f32(x, vcvtq_f32_s32(m), CONST_LN2);
+
+    // Polynomial Approximation of e^val
+    float32x4_t poly = vtaylor_polyq_f32(val, exp_tab);
+
+    // Reconstruct: scale by 2^m by adding m directly to the float exponent bits
+    poly = vreinterpretq_f32_s32(vaddq_s32(vreinterpretq_s32_f32(poly), vshlq_n_s32(m, 23)));
+
+    return poly;
+}
+
+inline float32x4_t vlogq_f32(float32x4_t x)
+{
+    static const int32x4_t   CONST_127 = vdupq_n_s32(127);           // 127
+    static const float32x4_t CONST_LN2 = vdupq_n_f32(0.6931471805f); // ln(2)
+
+    // Extract exponent: m such that x = val * 2^m
+    int32x4_t   m   = vsubq_s32(vreinterpretq_s32_u32(vshrq_n_u32(vreinterpretq_u32_f32(x), 23)), CONST_127);
+    float32x4_t val = vreinterpretq_f32_s32(vsubq_s32(vreinterpretq_s32_f32(x), vshlq_n_s32(m, 23)));
+
+    // Polynomial Approximation of log(val)
+    float32x4_t poly = vtaylor_polyq_f32(val, log_tab);
+
+    // Reconstruct: log(x) = log(val) + m*ln(2)
+    poly = vmlaq_f32(poly, vcvtq_f32_s32(m), CONST_LN2);
+
+    return poly;
+}
+
+inline float32x4_t vtanhq_f32(float32x4_t val)
+{
+    static const float32x4_t CONST_1        = vdupq_n_f32(1.f);
+    static const float32x4_t CONST_2        = vdupq_n_f32(2.f);
+    static const float32x4_t CONST_MIN_TANH = vdupq_n_f32(-10.f);
+    static const float32x4_t CONST_MAX_TANH = vdupq_n_f32(10.f);
+
+    // tanh(x) = (e^2x - 1)/(e^2x + 1), with x clamped to [-10,10]
+    float32x4_t x     = vminq_f32(vmaxq_f32(val, CONST_MIN_TANH), CONST_MAX_TANH);
+    float32x4_t exp2x = vexpq_f32(vmulq_f32(CONST_2, x));
+    float32x4_t num   = vsubq_f32(exp2x, CONST_1);
+    float32x4_t den   = vaddq_f32(exp2x, CONST_1);
+    float32x4_t tanh  = vmulq_f32(num, vinvq_f32(den));
+    return tanh;
+}
+
+inline float32x4_t vpowq_f32(float32x4_t val, float32x4_t n)
+{
+    // pow(x,n) = e^(n*log(x))
+    return vexpq_f32(vmulq_f32(n, vlogq_f32(val)));
+}
+}
\ No newline at end of file
diff --git a/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
new file mode 100644
index 0000000000..9ef93ce67a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEAbsoluteDifferenceKernel.h
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__
+#define __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the absolute difference kernel
+ *
+ * Absolute difference is computed by:
+ * @f[ output(x,y) = | input1(x,y) - input2(x,y) | @f]
+ */
+class NEAbsoluteDifferenceKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEAbsoluteDifferenceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEAbsoluteDifferenceKernel(const NEAbsoluteDifferenceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEAbsoluteDifferenceKernel &operator=(const NEAbsoluteDifferenceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEAbsoluteDifferenceKernel(NEAbsoluteDifferenceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEAbsoluteDifferenceKernel &operator=(NEAbsoluteDifferenceKernel &&) = default;
+    /** Default destructor */
+    ~NEAbsoluteDifferenceKernel() = default;
+
+    /** Set the input and output tensors
+     *
+     * @param[in]  input1 Source tensor. Data types supported: U8/S16
+     * @param[in]  input2 Source tensor. Data types supported: U8/S16
+     * @param[out] output Destination tensor. Data types supported: U8/S16
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised absolute difference functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16.
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16.
+     * @param[in]  window Region on which to execute the kernel.
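+     *
+     * For illustration, a hypothetical U8 specialisation matching this signature could be
+     * sketched as follows (not the actual kernel code; the name abs_diff_U8_U8_U8 is invented
+     * and the window is assumed to step 16 elements at a time):
+     * @code
+     * void abs_diff_U8_U8_U8(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window)
+     * {
+     *     Iterator input1(in1, window);
+     *     Iterator input2(in2, window);
+     *     Iterator output(out, window);
+     *
+     *     execute_window_loop(window, [&](const Coordinates &)
+     *     {
+     *         // vabdq_u8 computes the lane-wise absolute difference |a - b|
+     *         const uint8x16_t a = vld1q_u8(input1.ptr());
+     *         const uint8x16_t b = vld1q_u8(input2.ptr());
+     *         vst1q_u8(output.ptr(), vabdq_u8(a, b));
+     *     },
+     *     input1, input2, output);
+     * }
+     * @endcode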
+     */
+    using AbsDiffFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+
+    /** Absolute difference function to use for the particular tensor formats passed to configure() */
+    AbsDiffFunction *_func;
+    const ITensor   *_input1;
+    const ITensor   *_input2;
+    ITensor         *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEABSOLUTEDIFFERENCEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
new file mode 100644
index 0000000000..df6d7b8891
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEAccumulateKernel.h
@@ -0,0 +1,122 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACCUMULATEKERNEL_H__
+#define __ARM_COMPUTE_NEACCUMULATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the accumulate kernel
+ *
+ * Accumulation is computed by:
+ * @f[ accum(x,y) = accum(x,y) + input(x,y) @f]
+ */
+class NEAccumulateKernel : public INESimpleKernel
+{
+public:
+    /** Set the input and accumulation tensors
+     *
+     * @param[in]  input Source tensor. Data type supported: U8.
+     * @param[out] accum Destination tensor. Data type supported: S16.
+     */
+    void configure(const ITensor *input, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+
+/** Interface for the accumulate weighted kernel
+ *
+ * Weighted accumulation is computed by:
+ * @f[ accum(x,y) = (1 - \alpha)*accum(x,y) + \alpha*input(x,y) @f]
+ *
+ * Where @f$ 0 \le \alpha \le 1 @f$
+ * Conceptually, the rounding for this is defined as:
+ * @f[ output(x,y)= uint8( (1 - \alpha) * float32( int32( output(x,y) ) ) + \alpha * float32( int32( input(x,y) ) ) ) @f]
+*/
+class NEAccumulateWeightedKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEAccumulateWeightedKernel();
+    /** Set the input and accumulation tensors, and the scale value
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in]     alpha Scalar value in the range [0.0f, 1.0f]
+     * @param[in,out] accum Accumulated tensor. Data type supported: U8.
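+     *
+     * A minimal configure sketch (illustration only; input and accum are assumed to be
+     * already-allocated U8 tensors of matching shape):
+     * @code
+     * NEAccumulateWeightedKernel k;
+     * k.configure(&input, 0.5f, &accum); // accum = 0.5 * accum + 0.5 * input
+     * @endcode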
+     */
+    void configure(const ITensor *input, float alpha, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+protected:
+    float _alpha;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Interface for the accumulate weighted kernel using F16 */
+class NEAccumulateWeightedFP16Kernel : public NEAccumulateWeightedKernel
+{
+public:
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+#else
+using NEAccumulateWeightedFP16Kernel = NEAccumulateWeightedKernel;
+#endif
+
+/** Interface for the accumulate squared kernel
+ *
+ * The accumulation of squares is computed by:
+ * @f[ accum(x,y) = saturate_{int16} ( (uint16) accum(x,y) + (((uint16)(input(x,y)^2)) >> (shift)) ) @f]
+ *
+ * Where @f$ 0 \le shift \le 15 @f$
+*/
+class NEAccumulateSquaredKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEAccumulateSquaredKernel();
+    /** Set the input and accumulation tensors and the shift value.
+     *
+     * @param[in]     input Source tensor. Data type supported: U8.
+     * @param[in]     shift Shift value in the range [0, 15]
+     * @param[in,out] accum Accumulated tensor. Data type supported: S16.
+     */
+    void configure(const ITensor *input, uint32_t shift, ITensor *accum);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    uint32_t _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEACCUMULATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
new file mode 100644
index 0000000000..97f92d6a1e
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEActivationLayerKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/FixedPoint.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the activation layer kernel.
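+ *
+ * A minimal configure sketch (illustration only; src and dst are assumed to be
+ * already-allocated F32 tensors of identical shape):
+ * @code
+ * NEActivationLayerKernel act;
+ * act.configure(&src, &dst, ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU));
+ * @endcode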
+ */
+class NEActivationLayerKernel : public INESimpleKernel
+{
+public:
+    /** Constructor */
+    NEActivationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEActivationLayerKernel(const NEActivationLayerKernel &) = delete;
+    /** Default move constructor */
+    NEActivationLayerKernel(NEActivationLayerKernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEActivationLayerKernel &operator=(const NEActivationLayerKernel &) = delete;
+    /** Default move assignment operator */
+    NEActivationLayerKernel &operator=(NEActivationLayerKernel &&) = default;
+    /** Set the input and output tensor.
+     *
+     * @param[in]  input           Source tensor. Data types supported: QS8/F32.
+     * @param[out] output          Destination tensor. Data type supported: same as @p input
+     * @param[in]  activation_info Activation layer information.
+     */
+    void configure(const ITensor *input, ITensor *output, ActivationLayerInfo activation_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using ActivationFunction = ActivationLayerInfo::ActivationFunction;
+    /** Common signature for all the specialised @ref NEActivationLayerKernel functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ActivationFunctionExecutorPtr = void (NEActivationLayerKernel::*)(const Window &window);
+    /** Function to apply an activation function on an F32 tensor.
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    template <ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, float>::value, void>::type activation(const Window &window);
+    /** Function to apply an activation function on a QS8 (fixed point) tensor.
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    template <ActivationFunction F, typename T>
+    typename std::enable_if<std::is_same<T, qint8_t>::value, void>::type activation(const Window &window);
+
+private:
+    ActivationFunctionExecutorPtr _func;
+    ActivationLayerInfo           _act_info;
+};
+}
+#endif /*__ARM_COMPUTE_NEACTIVATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
new file mode 100644
index 0000000000..b36ca46e1a
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEArithmeticAdditionKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__
+#define __ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform addition between two tensors */
+class NEArithmeticAdditionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEArithmeticAdditionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAdditionKernel(const NEArithmeticAdditionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticAdditionKernel &operator=(const NEArithmeticAdditionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEArithmeticAdditionKernel(NEArithmeticAdditionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEArithmeticAdditionKernel &operator=(NEArithmeticAdditionKernel &&) = default;
+    /** Default destructor */
+    ~NEArithmeticAdditionKernel() = default;
+
+    /** Initialise the kernel's inputs, output and overflow policy.
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32).
+     * @param[in]  policy Overflow policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised add functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32).
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using AddFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+    /** Add function to use for the particular tensor types passed to configure() */
+    AddFunction *_func;
+    const ITensor *_input1;
+    const ITensor *_input2;
+    ITensor       *_output;
+};
+}
+#endif /*__ARM_COMPUTE_NEARITHMETICADDITIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
new file mode 100644
index 0000000000..0eb9c23686
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEArithmeticSubtractionKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__
+#define __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform subtraction between two tensors */
+class NEArithmeticSubtractionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEArithmeticSubtractionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtractionKernel(const NEArithmeticSubtractionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEArithmeticSubtractionKernel &operator=(const NEArithmeticSubtractionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEArithmeticSubtractionKernel(NEArithmeticSubtractionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEArithmeticSubtractionKernel &operator=(NEArithmeticSubtractionKernel &&) = default;
+    /** Default destructor */
+    ~NEArithmeticSubtractionKernel() = default;
+
+    /** Initialise the kernel's inputs, output and overflow policy.
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32).
+     * @param[in]  policy Overflow policy.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output, ConvertPolicy policy);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Common signature for all the specialised sub functions
+     *
+     * @param[in]  input1 An input tensor. Data types supported: U8/S16/F32.
+     * @param[in]  input2 An input tensor. Data types supported: U8/S16/F32 (only if @p input1 is F32).
+     * @param[out] output The output tensor. Data types supported: U8 (only if both inputs are U8), S16/F32 (only if both inputs are F32).
+     * @param[in]  window Region on which to execute the kernel.
+     */
+    using SubFunction = void(const ITensor *input1, const ITensor *input2, ITensor *output, const Window &window);
+    /** Sub function to use for the particular tensor types passed to configure() */
+    SubFunction *_func;
+    const ITensor *_input1;
+    const ITensor *_input2;
+    ITensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NEARITHMETICSUBTRACTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
new file mode 100644
index 0000000000..29fcbd26a0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBatchNormalizationLayerKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the batch normalization layer kernel.
+ */
+class NEBatchNormalizationLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBatchNormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBatchNormalizationLayerKernel &operator=(const NEBatchNormalizationLayerKernel &) = delete;
+    /** Default move constructor. */
+    NEBatchNormalizationLayerKernel(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    NEBatchNormalizationLayerKernel &operator=(NEBatchNormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEBatchNormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input   Source tensor. 3 lower dimensions represent a single input with dimensions [width, height, FM].
+     *                     The rest are optional and used for representing batches. Data types supported: QS8/F32.
+     * @param[in]  mean    Mean values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: same as @p input
+     * @param[in]  var     Variance values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: same as @p input
+     * @param[in]  gamma   Gamma values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: same as @p input
+     * @param[in]  beta    Beta values tensor. 1 dimension with size equal to the feature maps [FM]. Data types supported: same as @p input
+     * @param[in]  epsilon Small value to avoid division by zero.
+     * @param[out] output  Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     */
+    void configure(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using BatchNormFunction = void(const ITensor *input, ITensor *output, const ITensor *mean, const ITensor *var, const ITensor *beta, const ITensor *gamma, float epsilon, const Window &window);
+    BatchNormFunction *_func;
+    const ITensor     *_input;
+    ITensor           *_output;
+    const ITensor     *_mean;
+    const ITensor     *_var;
+    const ITensor     *_gamma;
+    const ITensor     *_beta;
+    float              _epsilon;
+};
+}
+#endif /*__ARM_COMPUTE_NEBATCHNORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
new file mode 100644
index 0000000000..b931445419
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseAndKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEANDKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEANDKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise AND between XY-planes of two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \land input2(x,y) @f]
+ */
+class NEBitwiseAndKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseAndKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseAndKernel(const NEBitwiseAndKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseAndKernel &operator=(const NEBitwiseAndKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseAndKernel(NEBitwiseAndKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseAndKernel &operator=(NEBitwiseAndKernel &&) = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]  input1 An input tensor. Data type supported: U8.
+     * @param[in]  input2 An input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
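+     *
+     * A minimal configure sketch (illustration only; a, b and dst are assumed to be
+     * already-allocated U8 tensors of identical shape):
+     * @code
+     * NEBitwiseAndKernel k;
+     * k.configure(&a, &b, &dst); // dst(x,y) = a(x,y) & b(x,y)
+     * @endcode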
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEANDKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
new file mode 100644
index 0000000000..e34eb0f5ae
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseNotKernel.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISENOTKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISENOTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise NOT operation
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = \lnot input(x,y) @f]
+ */
+class NEBitwiseNotKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseNotKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseNotKernel(const NEBitwiseNotKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseNotKernel &operator=(const NEBitwiseNotKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseNotKernel(NEBitwiseNotKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseNotKernel &operator=(NEBitwiseNotKernel &&) = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input  An input tensor. Data type supported: U8.
+     * @param[out] output The output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input;  /**< Source tensor */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISENOTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
new file mode 100644
index 0000000000..d2bae2660c
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseOrKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEORKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise inclusive OR between two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \lor input2(x,y) @f]
+ */
+class NEBitwiseOrKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseOrKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseOrKernel(const NEBitwiseOrKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseOrKernel &operator=(const NEBitwiseOrKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseOrKernel(NEBitwiseOrKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseOrKernel &operator=(NEBitwiseOrKernel &&) = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 An input tensor. Data type supported: U8.
+     * @param[in]  input2 An input tensor. Data type supported: U8.
+     * @param[out] output Output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
new file mode 100644
index 0000000000..9dea36e7e3
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBitwiseXorKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBITWISEXORKERNEL_H__
+#define __ARM_COMPUTE_NEBITWISEXORKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform bitwise exclusive OR (XOR) between two tensors
+ *
+ * Result is computed by:
+ * @f[ output(x,y) = input1(x,y) \oplus input2(x,y) @f]
+ */
+class NEBitwiseXorKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEBitwiseXorKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseXorKernel(const NEBitwiseXorKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEBitwiseXorKernel &operator=(const NEBitwiseXorKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEBitwiseXorKernel(NEBitwiseXorKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEBitwiseXorKernel &operator=(NEBitwiseXorKernel &&) = default;
+    /** Initialise the kernel's inputs and output.
+     *
+     * @param[in]  input1 An input tensor. Data type supported: U8.
+     * @param[in]  input2 An input tensor. Data type supported: U8.
+     * @param[out] output The output tensor. Data type supported: U8.
+     */
+    void configure(const ITensor *input1, const ITensor *input2, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input1; /**< Source tensor 1 */
+    const ITensor *_input2; /**< Source tensor 2 */
+    ITensor       *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NEBITWISEXORKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
new file mode 100644
index 0000000000..6b7bebbf17
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEBox3x3Kernel.h
@@ -0,0 +1,62 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEBOX3x3KERNEL_H__
+#define __ARM_COMPUTE_NEBOX3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform a Box 3x3 filter */
+class NEBox3x3Kernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data type supported: U8.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform a Box 3x3 filter using F16 SIMD
+ */
+class NEBox3x3FP16Kernel : public NEBox3x3Kernel
+{
+public:
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+};
+#else
+using NEBox3x3FP16Kernel = NEBox3x3Kernel;
+#endif
+}
+#endif /*__ARM_COMPUTE_NEBOX3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
new file mode 100644
index 0000000000..b86085f439
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECannyEdgeKernel.h
@@ -0,0 +1,190 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECANNYEDGEKERNEL_H__
+#define __ARM_COMPUTE_NECANNYEDGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Computes magnitude and quantised phase from input gradients. */
+class NEGradientKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEGradientKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGradientKernel(const NEGradientKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEGradientKernel &operator=(const NEGradientKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEGradientKernel(NEGradientKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEGradientKernel &operator=(NEGradientKernel &&) = default;
+    /** Default destructor */
+    virtual ~NEGradientKernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @note gx, gy and magnitude must all be the same size (either 16 or 32)
+     *
+     * @param[in]  gx        Source tensor - Gx component. Data type supported: S16/S32.
+     * @param[in]  gy        Source tensor - Gy component. Data type supported: same as @p gx.
+     * @param[out] magnitude Destination tensor - Magnitude. Data type supported: U16 (if the data type of @p gx is S16) / U32 (if the data type of @p gx is S32).
+     * @param[out] phase     Destination tensor - Quantized phase. Data type supported: U8.
+     * @param[in]  norm_type Normalization type. If 1, L1-Norm, otherwise L2-Norm.
+     */
+    virtual void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+protected:
+    /** Common signature for all the specialised gradient functions
+     *
+     * @param[in]  gx_ptr        Pointer to the first input tensor.
+     * @param[in]  gy_ptr        Pointer to the second input tensor.
+     * @param[out] magnitude_ptr Pointer to the first output tensor.
+     * @param[out] phase_ptr     Pointer to the second output tensor.
+     */
+    using GradientFunction = void(const void *__restrict gx_ptr, const void *__restrict gy_ptr, void *__restrict magnitude_ptr, void *__restrict phase_ptr);
+
+    GradientFunction *_func;      /**< Gradient function to use for the particular tensor types passed to configure() */
+    const ITensor    *_gx;        /**< Source tensor - Gx component */
+    const ITensor    *_gy;        /**< Source tensor - Gy component */
+    ITensor          *_magnitude; /**< Destination tensor - Magnitude */
+    ITensor          *_phase;     /**< Destination tensor - Quantized phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Gradient computation
+ */
+class NEGradientFP16Kernel : public NEGradientKernel
+{
+public:
+    // Inherited methods overridden:
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase, int32_t norm_type) override;
+};
+#else /* ARM_COMPUTE_ENABLE_FP16 */
+using NEGradientFP16Kernel = NEGradientKernel;
+#endif /* ARM_COMPUTE_ENABLE_FP16 */
+
+/** NEON kernel to perform Non-Maxima suppression for Canny Edge.
+ *
+ * @note This kernel is meant to be used alongside CannyEdge and performs a non-maxima suppression using magnitude and phase of input
+ *       to characterize points as possible edges. Thus, at the end, each point will be set to EDGE, NO_EDGE or MAYBE.
+ *
+ * @note Hysteresis is computed in @ref NEEdgeTraceKernel
+ */
+class NEEdgeNonMaxSuppressionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEEdgeNonMaxSuppressionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeNonMaxSuppressionKernel(const NEEdgeNonMaxSuppressionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeNonMaxSuppressionKernel &operator=(const NEEdgeNonMaxSuppressionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEEdgeNonMaxSuppressionKernel(NEEdgeNonMaxSuppressionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEEdgeNonMaxSuppressionKernel &operator=(NEEdgeNonMaxSuppressionKernel &&) = default;
+    /** Default destructor */
+    ~NEEdgeNonMaxSuppressionKernel() = default;
+
+    /** Initialise the kernel's sources, destination and border mode.
+     *
+     * @param[in]  magnitude        Source tensor - Magnitude. Data type supported: U16/U32.
+     * @param[in]  phase            Source tensor - Quantized phase. Data type supported: U8.
+     * @param[out] output           Output tensor. Data type supported: U8. It will be filled with 0 for "no edge", 127 for "maybe", 255 for "edge"
+     * @param[in]  upper_thr        Upper threshold used for the hysteresis
+     * @param[in]  lower_thr        Lower threshold used for the hysteresis
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *magnitude, const ITensor *phase, ITensor *output, int32_t upper_thr, int32_t lower_thr, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Common signature for all the specialised non-maxima suppression functions
+     *
+     * @param[in]  magnitude_ptr Pointer to the first input tensor.
+     * @param[in]  phase_ptr     Pointer to the second input tensor.
+     * @param[out] output_ptr    Pointer to the output tensor.
+     * @param[in]  stride_mag    Stride of the magnitude tensor
+     * @param[in]  upper_thr     Upper threshold used for the hysteresis
+     * @param[in]  lower_thr     Lower threshold used for the hysteresis
+     */
+    using EdgeNonMaxSupprFunction = void(const void *__restrict magnitude_ptr, const void *__restrict phase_ptr, void *__restrict output_ptr, const uint32_t stride_mag, const int32_t upper_thr,
+                                         const int32_t lower_thr);
+
+    EdgeNonMaxSupprFunction *_func;      /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+    const ITensor           *_magnitude; /**< Source tensor - Magnitude */
+    const ITensor           *_phase;     /**< Source tensor - Quantized phase */
+    ITensor                 *_output;    /**< Destination tensor */
+    int32_t                  _lower_thr; /**< Lower threshold used for the hysteresis */
+    int32_t                  _upper_thr; /**< Upper threshold used for the hysteresis */
+};
+
+/** NEON kernel to perform Edge tracing */
+class NEEdgeTraceKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEEdgeTraceKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeTraceKernel(const NEEdgeTraceKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEEdgeTraceKernel &operator=(const NEEdgeTraceKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEEdgeTraceKernel(NEEdgeTraceKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEEdgeTraceKernel &operator=(NEEdgeTraceKernel &&) = default;
+    /** Default destructor */
+    ~NEEdgeTraceKernel() = default;
+
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in,out] input  Source tensor. Data type supported: U8. Must contain 0 for "no edge", 127 for "maybe", 255 for "edge"
+     * @param[in,out] output Destination tensor. Data type supported: U8. Must be initialized to 0 (No edge).
+     */
+    void configure(ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+    bool is_parallelisable() const override;
+
+private:
+    ITensor *_input;  /**< Source tensor */
+    ITensor *_output; /**< Destination tensor */
+};
+}
+#endif /* __ARM_COMPUTE_NECANNYEDGEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
new file mode 100644
index 0000000000..8b669a4d28
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelCombineKernel.h
@@ -0,0 +1,125 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <array>
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel combine kernel */
+class NEChannelCombineKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEChannelCombineKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel(const NEChannelCombineKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelCombineKernel &operator=(const NEChannelCombineKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel(NEChannelCombineKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelCombineKernel &operator=(NEChannelCombineKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelCombineKernel() = default;
+
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8.
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8.
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8.
+     * @param[in]  plane3 The 2D plane that forms channel 3. Data type supported: U8.
+     * @param[out] output The single planar output tensor. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     */
+    void configure(const ITensor *plane0, const ITensor *plane1, const ITensor *plane2, const ITensor *plane3, ITensor *output);
+    /** Configure function's inputs and outputs.
+     *
+     * @param[in]  plane0 The 2D plane that forms channel 0. Data type supported: U8.
+     * @param[in]  plane1 The 2D plane that forms channel 1. Data type supported: U8.
+     * @param[in]  plane2 The 2D plane that forms channel 2. Data type supported: U8.
+     * @param[out] output The multi-planar output tensor. Formats supported: NV12/NV21/IYUV/YUV444
+     */
+    void configure(const IImage *plane0, const IImage *plane1, const IImage *plane2, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Combine 3 planes to form a three channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_3C(const Window &win);
+    /** Combine 4 planes to form a four channel single plane tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_4C(const Window &win);
+    /** Combine 3 planes to form a single plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    template <bool is_yuyv>
+    void combine_YUV_1p(const Window &win);
+    /** Combine 3 planes to form a two plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_2p(const Window &win);
+    /** Combine 3 planes to form a three plane YUV tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void combine_YUV_3p(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void copy_plane(const Window &win, uint32_t plane_id);
+    /** Common signature for all the specialised ChannelCombine functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelCombineFunction = void (NEChannelCombineKernel::*)(const Window &window);
+    /** ChannelCombine function to use for the particular tensor types passed to configure() */
+    ChannelCombineFunction _func;
+    std::array<const ITensor *, 4> _planes;
+    ITensor     *_output;
+    IMultiImage *_output_multi;
+    std::array<uint32_t, 3> _x_subsampling;
+    std::array<uint32_t, 3> _y_subsampling;
+    unsigned int _num_elems_processed_per_iteration;
+    bool         _is_parallelizable;
+};
+}
+#endif /* __ARM_COMPUTE_NECHANNELCOMBINEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
new file mode 100644
index 0000000000..0715e1f8cb
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEChannelExtractKernel.h
@@ -0,0 +1,109 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+#define __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the channel extract kernel */
+class NEChannelExtractKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEChannelExtractKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel(const NEChannelExtractKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEChannelExtractKernel &operator=(const NEChannelExtractKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel(NEChannelExtractKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEChannelExtractKernel &operator=(NEChannelExtractKernel &&) = default;
+    /** Default destructor */
+    ~NEChannelExtractKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Source tensor. Formats supported: RGB888/RGBA8888/YUYV422/UYVY422
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Destination tensor. Format supported: U8
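+     *
+     * A minimal configuration sketch (illustrative only; tensor creation, allocation and kernel scheduling are assumed to happen elsewhere):
+     * @code
+     * NEChannelExtractKernel extract;
+     * extract.configure(&rgb_src, Channel::R, &r_plane); // extract the R plane of an RGB888 tensor into a U8 tensor
+     * @endcode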
+     */
+    void configure(const ITensor *input, Channel channel, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input   Multi-planar source image. Formats supported: NV12/NV21/IYUV/YUV444
+     * @param[in]  channel Channel to extract.
+     * @param[out] output  Single-planar destination image. Format supported: U8
+     */
+    void configure(const IMultiImage *input, Channel channel, IImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Extract one channel from a two channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_2C_img(const Window &win);
+    /** Extract one channel from a three channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_3C_img(const Window &win);
+    /** Extract one channel from a four channel planar tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_1C_from_4C_img(const Window &win);
+    /** Extract the U/V channel from a single planar YUYV/UYVY tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void extract_YUYV_uv(const Window &win);
+    /** Copies a full plane to the output tensor.
+     *
+     * @param[in] win Region on which to execute the kernel.
+     */
+    void copy_plane(const Window &win);
+    /** Common signature for all the specialised ChannelExtract functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using ChannelExtractFunction = void (NEChannelExtractKernel::*)(const Window &window);
+    /** ChannelExtract function to use for the particular tensor types passed to configure() */
+    ChannelExtractFunction _func;
+    unsigned int _lut_index;
+};
+}
+#endif /* __ARM_COMPUTE_NECHANNELEXTRACTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECol2ImKernel.h b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
new file mode 100644
index 0000000000..f6bc2152da
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECol2ImKernel.h
@@ -0,0 +1,100 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOL2IMKERNEL_H__
+#define __ARM_COMPUTE_NECOL2IMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to perform col2im reshaping.
+ *
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
+ *
+ * For example, a vector of 9 elements can be reshaped to a block (image) of 3x3:
+ *
+ * @f[
+ * \left( \begin{array}{ccccccccc}
+ * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccc}
+ * a0 & a1 & a2 \\
+ * a3 & a4 & a5 \\
+ * a6 & a7 & a8 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NECol2ImKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NECol2ImKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel(const NECol2ImKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECol2ImKernel &operator=(const NECol2ImKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel(NECol2ImKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECol2ImKernel &operator=(NECol2ImKernel &&) = default;
+    /** Default destructor */
+    ~NECol2ImKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. Data types supported: U8/S8/QS8/U16/S16/QS16/F16/U32/S32/F32
+     * @param[out] output         The output tensor. 3 lower dimensions represent a single output [width, height, OFM],
+     *                            while the rest represent batch of outputs. Data types supported: Same as @p input
+     * @param[in]  convolved_dims Output convolved dimensions.
+     */
+    void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Template function to run the col2im
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_col2im(const Window &window);
+
+    /** Common signature for all the specialised col2im functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Col2ImFunctionPtr = void (NECol2ImKernel::*)(const Window &window);
+
+    Col2ImFunctionPtr _func;
+    const ITensor    *_input;
+    ITensor          *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+};
+}
+
+#endif /*__ARM_COMPUTE_NECOL2IMKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEColorConvertKernel.h b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
new file mode 100644
index 0000000000..2297218117
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEColorConvertKernel.h
@@ -0,0 +1,88 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NECOLORCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class IMultiImage;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the color convert kernel */
+class NEColorConvertKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEColorConvertKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel(const NEColorConvertKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEColorConvertKernel &operator=(const NEColorConvertKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel(NEColorConvertKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEColorConvertKernel &operator=(NEColorConvertKernel &&) = default;
+    /** Default destructor */
+    ~NEColorConvertKernel() = default;
+
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Source tensor. Formats supported: RGBA8888/UYVY422/YUYV422/RGB888
+     * @param[out] output Destination tensor. Formats supported: RGB888 (if the formats of @p input are RGBA8888/UYVY422/YUYV422),
+     *                    RGBA8888 (if the formats of @p input are UYVY422/YUYV422/RGB888)
+     */
+    void configure(const ITensor *input, ITensor *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Single-planar destination image. Formats supported: RGB888/RGBA8888
+     */
+    void configure(const IMultiImage *input, IImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Single-planar source image. Formats supported: RGB888/RGBA8888/UYVY422/YUYV422
+     * @param[out] output Multi-planar destination image. Formats supported: NV12/IYUV/YUV444 (if the formats of @p input are RGB888/RGBA8888)
+     */
+    void configure(const IImage *input, IMultiImage *output);
+    /** Set the input and output of the kernel
+     *
+     * @param[in]  input  Multi-planar source image. Formats supported: NV12/NV21/IYUV
+     * @param[out] output Multi-planar destination image. Formats supported: YUV444/IYUV (if the formats of @p input are NV12/NV21), NV12 (if the format of @p input is IYUV)
+     */
+    void configure(const IMultiImage *input, IMultiImage *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    using ColorConvertFunction = void(const void *__restrict input_ptr, void *__restrict output_ptr, const Window &win);
+    const void *_input;
+    void       *_output;
+    ColorConvertFunction *_func;
+};
+}
+#endif /*__ARM_COMPUTE_NECOLORCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEConvolutionKernel.h b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
new file mode 100644
index 0000000000..588a228a5d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEConvolutionKernel.h
@@ -0,0 +1,251 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECONVOLUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+#include <array>
+#include <cstdint>
+#include <vector>
+
+namespace arm_compute
+{
+class ITensor;
+
+/****************************************************************************************\
+ *                                  Square Convolution                                  *
+\****************************************************************************************/
+
+/** Interface for the kernel to run an arbitrary size convolution on a tensor. (Currently supports 3x3, 5x5, 7x7 and 9x9).
+ * The client can supply a convolution matrix \f$ C_{m,n} \f$.
+ * @f{eqnarray}{
+ *  k_0 &=& \frac{m}{2}  \\
+ *  l_0 &=& \frac{n}{2}  \\
+ *  sum &=& \sum_{k=0,l=0}^{k=m-1,l=n-1} input(x+k-k_0, y+l-l_0) C_{k,l}
+ * @f}
+ *
+ * @note The above equation for this function is similar to the default OpenCV Filter2D function,
+ *       which actually computes a correlation and not a convolution.
+ *       In case of a real convolution the convolution matrix should be flipped both horizontally and vertically.
+ */
+template <unsigned int matrix_size>
+class NEConvolutionKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEConvolutionKernel();
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
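+     *
+     * A minimal configuration sketch (illustrative only; @p input and @p output are assumed to be valid, already allocated U8 and S16 tensors):
+     * @code
+     * const int16_t gaussian3x3[] = { 1, 2, 1, 2, 4, 2, 1, 2, 1 }; // row-major coefficients
+     * NEConvolution3x3Kernel conv;
+     * conv.configure(&input, &output, gaussian3x3, 16, false);     // scale 16 = sum of the coefficients
+     * @endcode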
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    template <typename OutputType>
+    void convolution(const Window &win);
+
+protected:
+    uint32_t _scale; /**< scale of the convolution */
+    std::array<int16_t, matrix_size * matrix_size> _convolution; /**< convolution matrix */
+};
+
+/** Interface for the kernel which applies a 3x3 convolution to a tensor. */
+using NEConvolution3x3Kernel = NEConvolutionKernel<3>;
+/** Interface for the kernel which applies a 5x5 convolution to a tensor. */
+using NEConvolution5x5Kernel = NEConvolutionKernel<5>;
+/** Interface for the kernel which applies a 7x7 convolution to a tensor. */
+using NEConvolution7x7Kernel = NEConvolutionKernel<7>;
+/** Interface for the kernel which applies a 9x9 convolution to a tensor. */
+using NEConvolution9x9Kernel = NEConvolutionKernel<9>;
+
+/****************************************************************************************\
+ *                             Separable Square Convolution                             *
+\****************************************************************************************/
+
+/** Kernel for the Horizontal pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionHorKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NESeparableConvolutionHorKernel();
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U16, S16, S32.
+     * @param[in]  conv_row         Convolution matrix to apply to the input tensor.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_row, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] window Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolve(const Window &window);
+
+    std::array<int16_t, matrix_size> _conv_row; /**< Convolution coefficients */
+    BorderSize _border_size;                    /**< Border size */
+};
+
+/** Interface for the kernel which applies a 5x1 horizontal convolution to a tensor. */
+using NESeparableConvolution5x5HorKernel = NESeparableConvolutionHorKernel<5>;
+/** Interface for the kernel which applies a 7x1 horizontal convolution to a tensor. */
+using NESeparableConvolution7x7HorKernel = NESeparableConvolutionHorKernel<7>;
+/** Interface for the kernel which applies a 9x1 horizontal convolution to a tensor. */
+using NESeparableConvolution9x9HorKernel = NESeparableConvolutionHorKernel<9>;
+
+/** Kernel for the Vertical pass of a Separable Convolution */
+template <unsigned int matrix_size>
+class NESeparableConvolutionVertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NESeparableConvolutionVertKernel();
+
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U16, S16, S32.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv_col         Convolution matrix to apply to the input tensor.
+     * @param[in]  scale            Scale of the convolution matrix
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
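+     *
+     * A sketch of a complete separable 5x5 pass (illustrative only; @p tmp is an intermediate tensor using one of the data types listed above):
+     * @code
+     * const int16_t coeffs[] = { 1, 4, 6, 4, 1 };     // used for both passes
+     * NESeparableConvolution5x5HorKernel  hor;
+     * NESeparableConvolution5x5VertKernel vert;
+     * hor.configure(&src, &tmp, coeffs, false);
+     * vert.configure(&tmp, &dst, coeffs, 256, false); // scale 256 = (1+4+6+4+1)^2
+     * @endcode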
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv_col, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Apply the object's convolution to the given window of the input tensor.
+     * This function is used if the intermediate values have been stored as U16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_u16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     * This function is used if the intermediate values have been stored as S16.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s16(const Window &win);
+    /** Apply the object's convolution to the given window of the input tensor.
+     * This function is used if the intermediate values have been stored as S32.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType>
+    void convolution_s32(const Window &win);
+
+    std::array<int16_t, matrix_size> _conv_col; /**< Convolution coefficients */
+    uint32_t _scale;                            /**< Convolution's scale */
+};
+
+/** Interface for the kernel which applies a 1x5 vertical convolution to a tensor. */
+using NESeparableConvolution5x5VertKernel = NESeparableConvolutionVertKernel<5>;
+/** Interface for the kernel which applies a 1x7 vertical convolution to a tensor. */
+using NESeparableConvolution7x7VertKernel = NESeparableConvolutionVertKernel<7>;
+/** Interface for the kernel which applies a 1x9 vertical convolution to a tensor. */
+using NESeparableConvolution9x9VertKernel = NESeparableConvolutionVertKernel<9>;
+
+/****************************************************************************************\
+ *                                 Rectangle Convolution                                *
+\****************************************************************************************/
+
+/** Kernel for running the convolution on a rectangle matrix.
+ *
+ * @note Supports combinations of 3,5,7 and 9.
+ */
+class NEConvolutionRectangleKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEConvolutionRectangleKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel(NEConvolutionRectangleKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEConvolutionRectangleKernel &operator=(NEConvolutionRectangleKernel &&) = default;
+    /** Initialise the kernel's input, output and border mode.
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output           Destination tensor. Data types supported: U8, S16.
+     * @param[in]  conv             Convolution matrix to apply to the input tensor.
+     * @param[in]  width            Width of convolution matrix (Number of columns)
+     * @param[in]  height           Height of convolution matrix (Number of rows)
+     * @param[in]  scale            Scale of the convolution matrix. If 0 is passed, it will be set to the sum of the coefficients of the convolution or 1 if they add up to 0.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
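+     *
+     * A minimal configuration sketch (illustrative only) for a 5x3 averaging matrix, passed row-major:
+     * @code
+     * const int16_t box5x3[] = { 1, 1, 1, 1, 1,
+     *                            1, 1, 1, 1, 1,
+     *                            1, 1, 1, 1, 1 };
+     * NEConvolutionRectangleKernel rect;
+     * rect.configure(&src, &dst, box5x3, 5, 3, 15, false); // width = 5, height = 3, scale = sum of coefficients
+     * @endcode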
+     */
+    void configure(const ITensor *input, ITensor *output, const int16_t *conv, uint32_t width, uint32_t height, uint32_t scale, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    unsigned int get_index(uint32_t val);
+    /** Apply the object's convolution to the given window of the input tensor.
+     *
+     * @param[in] win Window to apply the convolution on.
+     */
+    template <typename OutputType, unsigned int rows, unsigned int cols>
+    void convolution(const Window &win);
+
+protected:
+    const ITensor *_input;             /**< Input tensor */
+    ITensor       *_output;            /**< Output tensor */
+    uint32_t       _scale;             /**< Scale of the convolution */
+    std::vector<int16_t> _convolution; /**< Convolution matrix */
+    BorderSize     _border_size;       /**< Calculated border width */
+    uint32_t       _func_idx;          /**< Index used to specify convolution function to be used */
+    const static unsigned int _nr_supported_sizes
+    {
+        4
+    }; /**< Number of supported permutations */
+};
+}
+#endif /*__ARM_COMPUTE_NECONVOLUTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
new file mode 100644
index 0000000000..67b8c6052d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NECumulativeDistributionKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+#define __ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ILut;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the cumulative distribution (cumulative summation) calculation kernel.
+ *
+ * This kernel calculates the cumulative sum of a given distribution (meaning that each output element
+ * is the sum of all its previous elements including itself) and creates a lookup table with the normalized
+ * pixel intensities, which is used to improve the contrast of the image.
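+ *
+ * For example, the distribution (histogram) [2, 0, 3, 1] has the cumulative sum [2, 2, 5, 6].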
+ */
+class NECumulativeDistributionKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NECumulativeDistributionKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel(const NECumulativeDistributionKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NECumulativeDistributionKernel &operator=(const NECumulativeDistributionKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel(NECumulativeDistributionKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NECumulativeDistributionKernel &operator=(NECumulativeDistributionKernel &&) = default;
+    /** Set the input and output distribution.
+     *
+     * @param[in]  input          Input image. Data type supported: U8
+     * @param[in]  distribution   Unnormalized 256-bin distribution of the input image.
+     * @param[out] cumulative_sum Cumulative distribution (Summed histogram). Should be same size as @p distribution.
+     * @param[out] output         Equalization lookup table. Should consist of 256 entries of U8 elements.
+     */
+    void configure(const IImage *input, const IDistribution1D *distribution, IDistribution1D *cumulative_sum, ILut *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage          *_input;          /**< Input image. */
+    const IDistribution1D *_distribution;   /**< Input histogram of the input image. */
+    IDistribution1D       *_cumulative_sum; /**< The cumulative distribution. */
+    ILut                  *_output;         /**< Output with the equalization lookup table. */
+private:
+    static const uint32_t _histogram_size = 256; /**< Default histogram size of 256. */
+};
+}
+
+#endif /*__ARM_COMPUTE_NECUMULATIVEDISTRIBUTIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
new file mode 100644
index 0000000000..7384cd1f02
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConcatenateKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+
+#ifndef __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the depth concatenate kernel.
+ * The input tensor will be concatenated into the output tensor.
+ */
+class NEDepthConcatenateKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDepthConcatenateKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel(const NEDepthConcatenateKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDepthConcatenateKernel &operator=(const NEDepthConcatenateKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel(NEDepthConcatenateKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDepthConcatenateKernel &operator=(NEDepthConcatenateKernel &&) = default;
+    /** Default destructor */
+    ~NEDepthConcatenateKernel() = default;
+    /** Initialise the kernel's inputs and output
+     *
+     * @param[in]     input        Input tensor. Data types supported: F32.
+     * @param[in]     depth_offset The offset on the Z axis.
+     * @param[in,out] output       Output tensor. Data types supported: F32.
+     *
+     * @note The output tensor's lowest two dimensions can't be smaller than the input's.
+     * @note The gaps between the two lowest dimensions of input and output need to be divisible by 2.
+     *
+     */
+    void configure(const ITensor *input, unsigned int depth_offset, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    ITensor       *_output;
+    int            _top_bottom;
+    int            _left_right;
+    unsigned int   _depth_offset;
+};
+}
+#endif /* __ARM_COMPUTE_NEDEPTHCONCATENATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
new file mode 100644
index 0000000000..0c5c29e4db
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDepthConvertKernel.h
@@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+#define __ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Depth conversion kernel */
+class NEDepthConvertKernel : public INESimpleKernel
+{
+public:
+    /** Default constructor */
+    NEDepthConvertKernel();
+    /** Set the input and output of the kernel
+     *
+     * Valid conversions Input -> Output:
+     *
+     * - QS8 -> F32
+     * - U8 -> U16, S16, S32
+     * - U16 -> U8, U32
+     * - S16 -> U8, S32
+     * - F32 -> QS8
+     *
+     * @param[in]  input  The input tensor to convert. Data types supported: U8/QS8/U16/S16/F32.
+     * @param[out] output The output tensor. Data types supported: U8/QS8/U16/S16/U32/S32/F32.
+     * @param[in]  policy Conversion policy.
+     * @param[in]  shift  Value for down/up conversions. Must be 0 <= shift < 8.
+     */
+    void configure(const ITensor *input, ITensor *output, ConvertPolicy policy, uint32_t shift);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    ConvertPolicy _policy;
+    uint32_t      _shift;
+};
+}
+#endif /*__ARM_COMPUTE_NEDEPTHCONVERTKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDerivativeKernel.h b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
new file mode 100644
index 0000000000..abb8a894c0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDerivativeKernel.h
@@ -0,0 +1,94 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+#define __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to run the derivative along the X/Y directions on a tensor.
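+ *
+ * A minimal configuration sketch (illustrative only), requesting only the X gradient:
+ * @code
+ * NEDerivativeKernel deriv;
+ * deriv.configure(&src_u8, &grad_x_s16, nullptr, false); // output_y is not required here
+ * @endcode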
+ */
+class NEDerivativeKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDerivativeKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel(const NEDerivativeKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDerivativeKernel &operator=(const NEDerivativeKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel(NEDerivativeKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDerivativeKernel &operator=(NEDerivativeKernel &&) = default;
+    /** Initialise the kernel's sources, destination and border
+     *
+     * @note At least one of output_x or output_y must be set
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8.
+     * @param[out] output_x         (Optional) Destination tensor for the X gradient. Data type supported: S16.
+     * @param[out] output_y         (Optional) Destination tensor for the Y gradient. Data type supported: S16.
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform derivative along the X direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_x(const Window &window);
+    /** Function to perform derivative along the Y direction on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_y(const Window &window);
+    /** Function to perform derivative along the X and Y directions on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void derivative_xy(const Window &window);
+    /** Common signature for all the specialised derivative functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using DerivativeFunction = void (NEDerivativeKernel::*)(const Window &window);
+    /** Derivative function to use for the particular tensor types passed to configure() */
+    DerivativeFunction _func;
+
+private:
+    const ITensor *_input;    /**< Input tensor */
+    ITensor       *_output_x; /**< Output tensor - Derivative along the X direction */
+    ITensor       *_output_y; /**< Output tensor - Derivative along the Y direction */
+};
+}
+#endif /* __ARM_COMPUTE_NEDERIVATIVEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDilateKernel.h b/arm_compute/core/NEON/kernels/NEDilateKernel.h
new file mode 100644
index 0000000000..05f148a1fd
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDilateKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDILATEKERNEL_H__
+#define __ARM_COMPUTE_NEDILATEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to perform boolean image dilation */
+class NEDilateKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEDILATEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
new file mode 100644
index 0000000000..f098e18655
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerBiasAccumulateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ +#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; +/** NEON kernel to accumulate the biases to each element of the input tensor + * + * @note We assume bias to be shared + */ +class NEDirectConvolutionLayerBiasAccumulateKernel : public INEKernel +{ +public: + /** Default constructor */ + NEDirectConvolutionLayerBiasAccumulateKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDirectConvolutionLayerBiasAccumulateKernel(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEDirectConvolutionLayerBiasAccumulateKernel &operator=(const NEDirectConvolutionLayerBiasAccumulateKernel &) = delete; + /** Allow instances of this class to be moved */ + NEDirectConvolutionLayerBiasAccumulateKernel(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default; + /** Allow instances of this class to be moved */ + NEDirectConvolutionLayerBiasAccumulateKernel &operator=(NEDirectConvolutionLayerBiasAccumulateKernel &&) = default; + /** Default destructor */ + ~NEDirectConvolutionLayerBiasAccumulateKernel() = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] input Input to add the bias to. If @p output is not specified then accumulation is done in-place. + * Data type supported: QS8/F32 + * @param[in] bias The shared bias tensor to add. It must be 1D Tensor. Data type supported: Same as @p input + * @param[out] output (Optional) If the output tensor is specified the accumulation is done out-of-place. (Defaults to nullptr) + * Data type supported: Same as @p input + */ + void configure(ITensor *input, const ITensor *bias, ITensor *output = nullptr); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using BiasAccumulateKernel = void(ITensor *input, const ITensor *bias, const Window window, ITensor *output); + +private: + BiasAccumulateKernel *_func; + ITensor *_input; + const ITensor *_bias; + ITensor *_output; +}; +} +#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERBIASACCUMULATEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h new file mode 100644 index 0000000000..d726071606 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEDirectConvolutionLayerKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON interface for Direct Convolution Layer kernel */
+class NEDirectConvolutionLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEDirectConvolutionLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEDirectConvolutionLayerKernel &operator=(const NEDirectConvolutionLayerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel(NEDirectConvolutionLayerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEDirectConvolutionLayerKernel &operator=(NEDirectConvolutionLayerKernel &&) = default;
+    /** Default destructor */
+    ~NEDirectConvolutionLayerKernel() = default;
+    /** Set the input, weights and output tensors.
+     *
+     * @param[in]  input     Input tensor. Data types supported: QS8/F32.
+     * @param[in]  weights   Set of kernels to convolve the input volume.
+     *                       The 3rd dimension must be the same as the input's volume 3rd dimension.
+     *                       Data type supported: Same as @p input.
+     * @param[out] output    Output tensor.
+     *                       The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p input.
+     * @param[in]  conv_info Contains padding and stride information described in @ref PadStrideInfo.
+     */
+    void configure(const ITensor *input, const ITensor *weights, ITensor *output, const PadStrideInfo &conv_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const ITensor *_input;
+    const ITensor *_weights;
+    ITensor       *_output;
+    PadStrideInfo  _conv_info;
+    BorderSize     _border_size;
+    unsigned int   _kernel_size;
+    unsigned int   _num_elems_read_per_iteration;
+    unsigned int   _num_elems_written_per_iteration;
+};
+}
+#endif /*__ARM_COMPUTE_NEDIRECTCONVOLUTIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEErodeKernel.h b/arm_compute/core/NEON/kernels/NEErodeKernel.h
new file mode 100644
index 0000000000..86dc217cc0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEErodeKernel.h
@@ -0,0 +1,49 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEERODEKERNEL_H__ +#define __ARM_COMPUTE_NEERODEKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform boolean image erosion */ +class NEErodeKernel : public INESimpleKernel +{ +public: + /** Set the source, destination and border mode of the kernel + * + * @param[in] input Source tensor. Data type supported: U8 + * @param[out] output Destination tensor. Data type supported: U8 + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_NEERODEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEFastCornersKernel.h b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h new file mode 100644 index 0000000000..d9bd6acde9 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEFastCornersKernel.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+#define __ARM_COMPUTE_NEFASTCORNERSKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** NEON kernel to perform fast corners */
+class NEFastCornersKernel : public INEKernel
+{
+public:
+    /** Constructor */
+    NEFastCornersKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel(const NEFastCornersKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFastCornersKernel &operator=(const NEFastCornersKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel(NEFastCornersKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFastCornersKernel &operator=(NEFastCornersKernel &&) = default;
+    /** Initialise the kernel.
+     *
+     * @param[in]  input               Source image. Data type supported: U8.
+     * @param[out] output              Output image. Data type supported: U8.
+     * @param[in]  threshold           Threshold on difference between intensity of the central pixel and pixels on Bresenham's circle of radius 3.
+     * @param[in]  non_max_suppression True if non-maxima suppression is applied, false otherwise.
+     * @param[in]  border_undefined    True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const IImage *input, IImage *output, uint8_t threshold, bool non_max_suppression, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    const IImage *_input;               /**< Source image */
+    IImage       *_output;              /**< Intermediate results */
+    uint8_t       _threshold;           /**< Threshold on difference between intensity */
+    bool          _non_max_suppression; /**< True if non-maxima suppression is applied in the next stage */
+};
+}
+#endif /*__ARM_COMPUTE_NEFASTCORNERSKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFillArrayKernel.h b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
new file mode 100644
index 0000000000..8e0846ea88
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillArrayKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+#define __ARM_COMPUTE_NEFILLARRAYKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** This kernel adds all texels greater than or equal to the threshold value to the keypoint array. */
+class NEFillArrayKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEFillArrayKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel(const NEFillArrayKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEFillArrayKernel &operator=(const NEFillArrayKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel(NEFillArrayKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEFillArrayKernel &operator=(NEFillArrayKernel &&) = default;
+    /** Default destructor */
+    ~NEFillArrayKernel() = default;
+
+    /** Initialise the kernel.
+     *
+     * @param[in]  input     Source image. Data type supported: U8.
+     * @param[in]  threshold Texels greater than the threshold will be added to the array.
+     * @param[out] output    Arrays of keypoints to store the results.
+     */
+    void configure(const IImage *input, uint8_t threshold, IKeyPointArray *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    const IImage   *_input;
+    IKeyPointArray *_output;
+    uint8_t         _threshold;
+};
+}
+#endif /*__ARM_COMPUTE_NEFILLARRAYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEFillBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
new file mode 100644
index 0000000000..3ec66115e2
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEFillBorderKernel.h
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEFILLBORDERKERNEL_H__ +#define __ARM_COMPUTE_NEFILLBORDERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to fill borders */ +class NEFillBorderKernel : public INEKernel +{ +public: + /** Default constructor */ + NEFillBorderKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFillBorderKernel(const NEFillBorderKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFillBorderKernel &operator=(const NEFillBorderKernel &) = delete; + /** Allow instances of this class to be moved */ + NEFillBorderKernel(NEFillBorderKernel &&) = default; + /** Allow instances of this class to be moved */ + NEFillBorderKernel &operator=(NEFillBorderKernel &&) = default; + /** Default destructor */ + ~NEFillBorderKernel() = default; + + /** Initialise the function. + * + * @note This kernel fills the borders within the XY-planes. + * + * @param[in,out] tensor Tensor to process. Data types supported: U8/S8/QS8/QS16/S16/S32/F32. + * @param[in] border_size Size of the border to fill in elements. + * @param[in] border_mode Border mode to use when filling the border. + * @param[in] constant_border_value (Optional) Constant value to use for borders if border_mode is set to CONSTANT. + * + */ + void configure(ITensor *tensor, BorderSize border_size, BorderMode border_mode, const PixelValue &constant_border_value = PixelValue()); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + template <typename T> + void fill_replicate_single_channel(const Window &window); + template <typename T> + void fill_constant_value_single_channel(const Window &window); + + ITensor *_tensor; + BorderSize _border_size; + BorderMode _mode; + PixelValue _constant_border_value; +}; +} +#endif /*__ARM_COMPUTE_NEFILLBORDERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h new file mode 100644 index 0000000000..61e6e46463 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEFillInnerBorderKernel.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
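
As a rough scalar model of what a border-filling kernel of this shape does (assumed semantics for illustration; fill_border is a hypothetical helper, not the kernel's code):

    #include <algorithm>

    // Fill a border of b elements around a w x h single-channel plane, either
    // with a constant value or by replicating the nearest edge element.
    // buf points at element (0, 0); the border lives at negative offsets and
    // beyond w/h, so stride must account for the padded row length.
    template <typename T>
    void fill_border(T *buf, int stride, int w, int h, int b, bool constant, T value)
    {
        for(int y = -b; y < h + b; ++y)
        {
            for(int x = -b; x < w + b; ++x)
            {
                if(x >= 0 && x < w && y >= 0 && y < h)
                {
                    continue; // interior elements are left untouched
                }
                if(constant)
                {
                    buf[y * stride + x] = value;
                }
                else
                {
                    // Replicate: clamp the coordinate to the nearest valid element
                    const int cx        = std::min(std::max(x, 0), w - 1);
                    const int cy        = std::min(std::max(y, 0), h - 1);
                    buf[y * stride + x] = buf[cy * stride + cx];
                }
            }
        }
    }
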
+ */ +#ifndef __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__ +#define __ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to fill the interior borders */ +class NEFillInnerBorderKernel : public INEKernel +{ +public: + /** Default constructor */ + NEFillInnerBorderKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFillInnerBorderKernel(const NEFillInnerBorderKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEFillInnerBorderKernel &operator=(const NEFillInnerBorderKernel &) = delete; + /** Allow instances of this class to be moved */ + NEFillInnerBorderKernel(NEFillInnerBorderKernel &&) = default; + /** Allow instances of this class to be moved */ + NEFillInnerBorderKernel &operator=(NEFillInnerBorderKernel &&) = default; + /** Default destructor */ + ~NEFillInnerBorderKernel() = default; + + /** Initialise the function. + * + * @note This kernel fills the borders within the XY-planes. + * + * @param[in,out] input Tensor to process. Data types supported: U8/QS8/S16/S32/F32. + * @param[in] border_size Size of the border to fill in elements. + * @param[in] constant_border_value (Optional) Constant value to use for the inner borders. + * + */ + void configure(ITensor *input, BorderSize border_size, const PixelValue &constant_border_value = PixelValue()); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + template <typename T> + void fill_value_single_channel(const Window &window); + + ITensor *_tensor; + BorderSize _border_size; + PixelValue _constant_border_value; +}; +} +#endif /*__ARM_COMPUTE_NEFILLINNERBORDERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h new file mode 100644 index 0000000000..b9884ffb57 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMInterleave4x4Kernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__ +#define __ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to interleave the elements of a matrix + * + * This function puts the values in a 4x4 block of Matrix A on the same row (Interleaved values) + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a10 & a20 & a30 & a01 & a11 & a21 & a31 & a02 & a12 & a22 & a32 & a03 & a13 & a23 & a33 \\ + * \end{array} \right) + * @f] + * + * After this operation, the output matrix will have the following shape: [ height * 4, ceil(width / 4.0f) ] + */ +class NEGEMMInterleave4x4Kernel : public INESimpleKernel +{ +public: + /** Constructor */ + NEGEMMInterleave4x4Kernel(); + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor which stores the interleaved matrix. Data type supported: same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the interleave functions + * + * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[out] output The output tensor. Data type supported: same as @p input + * @param[in] window Region on which to execute the kernel. + */ + using GEMMInterleaveFunction = void(const ITensor *input, ITensor *output, const Window &window); + + GEMMInterleaveFunction *_func; /**< GEMM interleave function to use for the particular tensor types passed to configure() */ +}; +} +#endif /*__ARM_COMPUTE_NEGEMMINTERLEAVE4x4KERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h new file mode 100644 index 0000000000..ba4dcc3373 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMLowpMatrixMultiplyKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
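
A scalar model of the 4x4 interleaving documented above (illustrative only; interleave_block is a hypothetical helper, the kernel itself works on whole tensors with NEON loads and stores):

    #include <cstddef>

    // Merge a 4x4 block of the input into one row of the output so that
    // a00, a10, a20, a30, a01, a11, ... become contiguous in memory.
    template <typename T>
    void interleave_block(const T *in, size_t in_stride, T *out)
    {
        for(int col = 0; col < 4; ++col)
        {
            for(int row = 0; row < 4; ++row)
            {
                out[col * 4 + row] = in[row * in_stride + col];
            }
        }
    }
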
+ */ +#ifndef __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to multiply matrices + * + * @note @ref NEGEMMLowpMatrixMultiplyKernel is a low precision matrix product kernel. + * This kernel performs the following computation: + * + * -# Convert the values of matrix A from uint8 to int32 and add a_offset to each of them. + * -# Convert the values of matrix B from uint8 to int32 and add b_offset to each of them. + * -# Compute the int32 matrix product of the resulting A * B. + * -# Add output_offset to each entry of the result. + * -# Multiply each entry of the result by output_mult_int and round to the nearest integer. + * -# Clamp the resulting int32 values to the [0..255] range and cast to uint8. + * + */ +class NEGEMMLowpMatrixMultiplyKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMLowpMatrixMultiplyKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpMatrixMultiplyKernel(const NEGEMMLowpMatrixMultiplyKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers)*/ + NEGEMMLowpMatrixMultiplyKernel &operator=(const NEGEMMLowpMatrixMultiplyKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMLowpMatrixMultiplyKernel(NEGEMMLowpMatrixMultiplyKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMLowpMatrixMultiplyKernel &operator=(NEGEMMLowpMatrixMultiplyKernel &&) = default; + /** Initialise the kernel's input and output. + * + * The input matrices @p input0 and @p input1 must be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel. These two + * kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] input0 Input tensor containing the interleaved Matrix A. Data type supported: U8 + * @param[in] input1 Input tensor containing the transposed Matrix B. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0 + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_offset Offset to be added to each element of the output matrix. + * @param[in] output_mult_int Value to be multiplied with each entry of the result. + * @param[in] shift Number of bits to shift right the result. + */ + void configure(const ITensor *input0, const ITensor *input1, ITensor *output, int32_t a_offset, int32_t b_offset, int32_t output_offset, int32_t output_mult_int, int32_t shift); + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input0; + const ITensor *_input1; + ITensor *_output; + int32_t _a_offset; + int32_t _b_offset; + int32_t _output_offset; + int32_t _output_mult_int; + int32_t _shift; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h new file mode 100644 index 0000000000..c0ecafcd39 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAccumulateBiasesKernel.h @@ -0,0 +1,63 @@ +/* + * Copyright (c) 2017 ARM Limited.
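
The six steps listed above amount to the following scalar computation for a single output element (a sketch with illustrative names; truncation is shown on the shift where the kernel documentation asks for round-to-nearest):

    #include <algorithm>
    #include <cstddef>
    #include <cstdint>

    uint8_t gemmlowp_one_element(const uint8_t *a_row, const uint8_t *b_col, int k, size_t b_stride,
                                 int32_t a_offset, int32_t b_offset,
                                 int32_t output_offset, int32_t output_mult_int, int32_t shift)
    {
        int32_t acc = 0;
        for(int i = 0; i < k; ++i)
        {
            // Steps 1-3: widen to int32, add the input offsets, accumulate the products
            acc += (static_cast<int32_t>(a_row[i]) + a_offset) * (static_cast<int32_t>(b_col[i * b_stride]) + b_offset);
        }
        // Steps 4-5: add the output offset, then scale and shift back down
        acc = ((acc + output_offset) * output_mult_int) >> shift;
        // Step 6: clamp to [0..255] and narrow to uint8
        return static_cast<uint8_t>(std::min(255, std::max(0, acc)));
    }
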
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; +/** NEON kernel to add a bias to each row of the input tensor */ +class NEGEMMMatrixAccumulateBiasesKernel : public INEKernel +{ +public: + /** Default constructor */ + NEGEMMMatrixAccumulateBiasesKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixAccumulateBiasesKernel(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixAccumulateBiasesKernel &operator=(const NEGEMMMatrixAccumulateBiasesKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAccumulateBiasesKernel(NEGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAccumulateBiasesKernel &operator=(NEGEMMMatrixAccumulateBiasesKernel &&) = default; + /** Default destructor */ + ~NEGEMMMatrixAccumulateBiasesKernel() = default; + /** Set the accumulate buffer and the biases of the kernel. + * + * @param[in, out] accum The accumulate tensor to convert. Data type supported: QS8/F32 + * @param[in] biases The shared biases tensor to append. It must be a 1D tensor. Data type supported: Same as @p accum + */ + void configure(ITensor *accum, const ITensor *biases); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + ITensor *_accum; + const ITensor *_biases; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMMATRIXACCUMULATEBIASESKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h new file mode 100644 index 0000000000..1ab52fa2f2 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixAdditionKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform the in-place matrix addition between two matrices, taking into account that the second matrix might be weighted by a scalar value beta: + * + * @note [ MTX_OUT = MTX_0 + beta * MTX_1 ] with MTX_0 and MTX_1 of the same size + * + * @note This stage is used to finalize the GEMM result and it is computed if and only if beta != 0.0. When this kernel is used to finalize the GEMM result, we have: + * - MTX_0 = A * B * alpha, where MTX_0 is the output of @ref NEGEMMMatrixMultiplyKernel + * - MTX_1 = C + */ +class NEGEMMMatrixAdditionKernel : public INESimpleKernel +{ +public: + /** Constructor */ + NEGEMMMatrixAdditionKernel(); + /** Prevent instances of this class from being copied */ + NEGEMMMatrixAdditionKernel(const NEGEMMMatrixAdditionKernel &) = delete; + /** Prevent instances of this class from being copied */ + NEGEMMMatrixAdditionKernel &operator=(const NEGEMMMatrixAdditionKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAdditionKernel(NEGEMMMatrixAdditionKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixAdditionKernel &operator=(NEGEMMMatrixAdditionKernel &&) = default; + /** Initialise the kernel's input and output. + * + * @note The input and output tensor must have the same dimensions + * + * @param[in] input Input tensor (Matrix C). Data types supported: QS8/F16/F32 + * @param[in, out] output Output tensor. If this kernel is used to finalize the GEMM result, output contains the result obtained by the kernel @ref NEGEMMMatrixMultiplyKernel. Data type supported: the same as @p input. + * @param[in] beta Weight of matrix C + */ + void configure(const ITensor *input, ITensor *output, float beta); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the matrix addition functions + * + * @param[in] input An input tensor. Data types supported: QS8/F16/F32 + * @param[out] output The output tensor. Data type supported: same as @p input + * @param[in] window Region on which to execute the kernel.
+ * @param[in] beta Weight of matrix C + */ + using MatrixAdditionFunction = void(const ITensor *input, ITensor *output, const Window &window, float beta); + /** Matrix addition function to use for the particular tensor types passed to configure() */ + MatrixAdditionFunction *_func; + float _beta; +}; +} +#endif /* __ARM_COMPUTE_NEGEMMMATRIXADDITIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h new file mode 100644 index 0000000000..a684945828 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMMatrixMultiplyKernel.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to multiply two input matrices "A" and "B". All elements of the output matrix/vector will be multiplied by alpha after the matrix multiplication + * + * @note If the output tensor is a matrix, the implementation assumes that the input tensors @p input0 and @p input1 are both matrices and reshaped respectively with @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel + * @note If the output tensor is a vector and the data type is F32, the implementation assumes that the first input tensor @p input0 is a vector and the second input tensor @p input1 a matrix. The implementation also assumes that both tensors have not been reshaped + * + */ +class NEGEMMMatrixMultiplyKernel : public INEKernel +{ +public: + /** Constructor */ + NEGEMMMatrixMultiplyKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixMultiplyKernel(const NEGEMMMatrixMultiplyKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGEMMMatrixMultiplyKernel &operator=(const NEGEMMMatrixMultiplyKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGEMMMatrixMultiplyKernel(NEGEMMMatrixMultiplyKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGEMMMatrixMultiplyKernel &operator=(NEGEMMMatrixMultiplyKernel &&) = default; + /** Initialise the kernel's input and output.
+ * + * @note If the output tensor is a matrix, the input matrices @p input0 and @p input1 should be the output of the kernels: @ref NEGEMMInterleave4x4Kernel and @ref NEGEMMTranspose1xWKernel + * These two kernels change the layout of the original matrices to be more cache-friendly. + * + * @param[in] input0 Input tensor containing the interleaved Matrix A or the vector A. Data types supported: F16/F32 + * @param[in] input1 Input tensor containing the transposed Matrix B if the first input tensor A is not a vector. + * If the output tensor is a vector, input1 must contain the matrix B not reshaped. Data type supported: same as @p input0 + * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0. + * @param[in] alpha Weight of the matrix product + */ + void configure(const ITensor *input0, const ITensor *input1, ITensor *output, float alpha); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input0; + const ITensor *_input1; + ITensor *_output; + float _alpha; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMMATRIXMULTIPLYKERNEL_H__*/ diff --git a/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h new file mode 100644 index 0000000000..5d8a3697cb --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGEMMTranspose1xWKernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__ +#define __ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel which transposes the elements of a matrix in chunks of 1xW, where W is equal to (16 / element size of the tensor) + * + * The following example shows how the 1xW transposition works when the input data type is F32 + * + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccccccccccccccccc} + * a00 & a01 & a02 & a03 & a10 & a11 & a12 & a13 & a20 & a21 & a22 & a23 & a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + * + * The following example shows how the 1xW transposition works when the input data type is F16 + * + * @f[ + * \left( \begin{array}{cccccccc} + * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 \\ + * a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 \\ + * a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 \\ + * a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{cccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccccc} + * a00 & a01 & a02 & a03 & a04 & a05 & a06 & a07 & a10 & a11 & a12 & a13 & a14 & a15 & a16 & a17 & a20 & a21 & a22 & a23 & a24 & a25 & a26 & a27 & a30 & a31 & a32 & a33 & a34 & a35 & a36 & a37\\ + * \end{array} \right) + * @f] + * + * @note The output matrix will have the following shape: [ height * W, ceil(width / W) ], where W = (16 / element size of the tensor) + * + */ +class NEGEMMTranspose1xWKernel : public INESimpleKernel +{ +public: + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor. Data type supported: same as @p input. + */ + void configure(const ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; +}; +} +#endif /*__ARM_COMPUTE_NEGEMMTRANSPOSE1xWKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h new file mode 100644 index 0000000000..763fab88f6 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGaussian3x3Kernel.h @@ -0,0 +1,50 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__ +#define __ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a Gaussian 3x3 filter */ +class NEGaussian3x3Kernel : public INESimpleKernel +{ +public: + /** Set the source, destination and border mode of the kernel + * + * @param[in] input Source tensor. Data type supported: U8 + * @param[out] output Destination tensor. Data type supported: S16 + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_NEGAUSSIAN3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h new file mode 100644 index 0000000000..86b28907da --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGaussian5x5Kernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__ +#define __ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a Gaussian 5x5 filter (horizontal pass) */ +class NEGaussian5x5HorKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NEGaussian5x5HorKernel(); + + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
+ */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + BorderSize _border_size; +}; + +/** NEON kernel to perform a Gaussian 5x5 filter (vertical pass) */ +class NEGaussian5x5VertKernel : public INESimpleKernel +{ +public: + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input Source tensor. Data type supported: S16. + * @param[out] output Destination tensor. Data type supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_NEGAUSSIAN5x5KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h new file mode 100644 index 0000000000..40a6aa7375 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEGaussianPyramidKernel.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ +#define __ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a GaussianPyramid (horizontal pass) */ +class NEGaussianPyramidHorKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NEGaussianPyramidHorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGaussianPyramidHorKernel(NEGaussianPyramidHorKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGaussianPyramidHorKernel &operator=(NEGaussianPyramidHorKernel &&) = default; + /** Default destructor */ + ~NEGaussianPyramidHorKernel() = default; + + /** Initialise the kernel's source, destination and border mode.
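
The horizontal/vertical kernel pairs above and below implement a separable 5x5 Gaussian: the 2D kernel is the outer product of [1 4 6 4 1] with itself, so the total weight is 16 * 16 = 256 and a final shift by 8 restores the U8 range. A plain scalar sketch of that scheme (illustrative only; the real kernels run the two passes as separate, windowed stages with S16 intermediates):

    #include <cstddef>
    #include <cstdint>
    #include <vector>

    void gaussian5x5(const uint8_t *src, int w, int h, uint8_t *dst)
    {
        static const int taps[5] = { 1, 4, 6, 4, 1 };
        std::vector<int32_t> tmp(static_cast<size_t>(w) * h, 0);
        // Horizontal pass (a 2-pixel border is left unprocessed)
        for(int y = 0; y < h; ++y)
            for(int x = 2; x < w - 2; ++x)
                for(int i = 0; i < 5; ++i)
                    tmp[y * w + x] += taps[i] * src[y * w + x + i - 2];
        // Vertical pass and normalisation by 256
        for(int y = 2; y < h - 2; ++y)
            for(int x = 2; x < w - 2; ++x)
            {
                int32_t sum = 0;
                for(int i = 0; i < 5; ++i)
                    sum += taps[i] * tmp[(y + i - 2) * w + x];
                dst[y * w + x] = static_cast<uint8_t>(sum >> 8);
            }
    }
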
+ * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + BorderSize _border_size; + int _l2_load_offset; +}; + +/** NEON kernel to perform a GaussianPyramid (vertical pass) */ +class NEGaussianPyramidVertKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NEGaussianPyramidVertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &) = delete; + /** Allow instances of this class to be moved */ + NEGaussianPyramidVertKernel(NEGaussianPyramidVertKernel &&) = default; + /** Allow instances of this class to be moved */ + NEGaussianPyramidVertKernel &operator=(NEGaussianPyramidVertKernel &&) = default; + /** Default destructor */ + ~NEGaussianPyramidVertKernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input Source tensor. Data type supported: S16. + * @param[out] output Destination tensor. Data type supported: U8. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + int _t2_load_offset; +}; +} +#endif /*__ARM_COMPUTE_NEGAUSSIANPYRAMIDKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h new file mode 100644 index 0000000000..dd85778b8a --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEHOGDescriptorKernel.h @@ -0,0 +1,141 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__ +#define __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__ + +#include "arm_compute/core/IHOG.h" +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Size2D.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform HOG Orientation Binning */ +class NEHOGOrientationBinningKernel : public INEKernel +{ +public: + /** Default constructor */ + NEHOGOrientationBinningKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGOrientationBinningKernel(const NEHOGOrientationBinningKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGOrientationBinningKernel &operator=(const NEHOGOrientationBinningKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHOGOrientationBinningKernel(NEHOGOrientationBinningKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHOGOrientationBinningKernel &operator=(NEHOGOrientationBinningKernel &&) = default; + /** Default destructor */ + ~NEHOGOrientationBinningKernel() = default; + + /** Initialise the kernel's inputs, output and HOG's metadata + * + * @param[in] input_magnitude Input tensor which stores the magnitude of the gradient for each pixel. Data type supported: S16. + * @param[in] input_phase Input tensor which stores the phase of the gradient for each pixel. Data type supported: U8 + * @param[out] output Output tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell + * @param[in] hog_info HOG's metadata + */ + void configure(const ITensor *input_magnitude, const ITensor *input_phase, ITensor *output, const HOGInfo *hog_info); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the specialised orientation binning functions + * + * @param[in] mag_row_ptr Pointer to the first row of the cell in the magnitude tensor + * @param[in] phase_row_ptr Pointer to the first row of the cell in the phase tensor + * @param[out] output_ptr Pointer to the output cell of hog space tensor + * @param[in] mag_stride Stride of the magnitude tensor + * @param[in] phase_stride Stride of the phase tensor + * @param[in] cell_width Width of the cell + * @param[in] cell_height Height of the cell + * @param[in] num_bins Number of bins for each cell + * @param[in] phase_scale Scale factor to apply to the phase in order to calculate the histogram index + */ + using OrientBinFunc = void(const int16_t *__restrict mag_row_ptr, const uint8_t *__restrict phase_row_ptr, float *__restrict output_ptr, size_t mag_stride, size_t phase_stride, size_t cell_width, + size_t cell_height, size_t num_bins, float phase_scale); + /** Orientation binning function to use for the particular cell width passed to configure() */ + OrientBinFunc *_func; + const ITensor *_input_magnitude; + const ITensor *_input_phase; + ITensor *_output; + size_t _cell_width; + size_t _cell_height; + size_t _num_bins; + float _phase_scale; +}; + +/** NEON kernel to perform HOG block normalization */ +class NEHOGBlockNormalizationKernel : public INEKernel +{ +public: + /** Default constructor */ + NEHOGBlockNormalizationKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGBlockNormalizationKernel(const NEHOGBlockNormalizationKernel &) = delete; + /** Prevent instances of
this class from being copied (As this class contains pointers) */ + NEHOGBlockNormalizationKernel &operator=(const NEHOGBlockNormalizationKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHOGBlockNormalizationKernel(NEHOGBlockNormalizationKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHOGBlockNormalizationKernel &operator=(NEHOGBlockNormalizationKernel &&) = default; + /** Default destructor */ + ~NEHOGBlockNormalizationKernel() = default; + + /** Initialise the kernel's input, output and HOG's metadata + * + * @param[in] input Input tensor which stores the local HOG for each cell. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per cell + * @param[out] output Output tensor which stores the normalised blocks. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block + * @param[in] hog_info HOG's metadata + */ + void configure(const ITensor *input, ITensor *output, const HOGInfo *hog_info); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the specialised block normalization functions + * + * @param[in] input_row_ptr Pointer to the first row of the block in the input hog space tensor + * @param[out] output_ptr Pointer to the output block of the hog normalized space + * @param[in] input_stride Stride of the input hog space tensor + * @param[in] num_cells_per_block_height Number of cells per block along the Y direction + * @param[in] num_bins_block_x Number of bins per block along the X direction + * @param[in] num_bins_block Number of total bins per block + * @param[in] l2_hyst_threshold Threshold to use for l2 hysteresis normalization + */ + using BlockNormFunc = void(const float *input_row_ptr, float *output_ptr, size_t input_stride, size_t num_cells_per_block_height, size_t num_bins_block_x, size_t num_bins_block, + float l2_hyst_threshold); + /** Block normalization function to use for the particular normalization type passed to configure() */ + BlockNormFunc *_func; + const ITensor *_input; + ITensor *_output; + Size2D _num_cells_per_block; + Size2D _num_cells_per_block_stride; + size_t _num_bins; + float _l2_hyst_threshold; +}; +} +#endif /* __ARM_COMPUTE_NEHOGDESCRIPTORKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h new file mode 100644 index 0000000000..e56d1e5fd8 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEHOGDetectorKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__ +#define __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__ + +#include "arm_compute/core/IArray.h" +#include "arm_compute/core/IHOG.h" +#include "arm_compute/core/NEON/INEKernel.h" + +#include <mutex> + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform HOG detection using a linear SVM */ +class NEHOGDetectorKernel : public INEKernel +{ +public: + /** Default constructor */ + NEHOGDetectorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGDetectorKernel(const NEHOGDetectorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEHOGDetectorKernel &operator=(const NEHOGDetectorKernel &) = delete; + /** Allow instances of this class to be moved */ + NEHOGDetectorKernel(NEHOGDetectorKernel &&) = default; + /** Allow instances of this class to be moved */ + NEHOGDetectorKernel &operator=(NEHOGDetectorKernel &&) = default; + /** Default destructor */ + ~NEHOGDetectorKernel() = default; + + /** Initialise the kernel's input, HOG data-object, detection window, the stride of the detection window, the threshold and index of the object to detect + * + * @param[in] input Input tensor which stores the HOG descriptor obtained with @ref NEHOGOrientationBinningKernel. Data type supported: F32. Number of channels supported: equal to the number of histogram bins per block + * @param[in] hog HOG data object used by @ref NEHOGOrientationBinningKernel and @ref NEHOGBlockNormalizationKernel + * @param[out] detection_windows Array of @ref DetectionWindow. This array stores all the detected objects + * @param[in] detection_window_stride Distance in pixels between 2 consecutive detection windows in x and y directions. + * It must be a multiple of hog->info()->block_stride() + * @param[in] threshold (Optional) Threshold for the distance between features and SVM classifying plane + * @param[in] idx_class (Optional) Index of the class used for evaluating which class the detection window belongs to + */ + void configure(const ITensor *input, const IHOG *hog, IDetectionWindowArray *detection_windows, const Size2D &detection_window_stride, float threshold = 0.0f, uint16_t idx_class = 0); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + const ITensor *_input; + IDetectionWindowArray *_detection_windows; + const float *_hog_descriptor; + float _bias; + float _threshold; + uint16_t _idx_class; + size_t _num_bins_per_descriptor_x; + size_t _num_blocks_per_descriptor_y; + size_t _block_stride_width; + size_t _block_stride_height; + size_t _detection_window_width; + size_t _detection_window_height; + size_t _max_num_detection_windows; + std::mutex _mutex; +}; +} + +#endif /* __ARM_COMPUTE_NEHOGDETECTORKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h new file mode 100644 index 0000000000..0abd73ef97 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEHarrisCornersKernel.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited.
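
The per-window work of a linear-SVM HOG detector like the one above reduces to a dot product plus a bias, compared against a threshold (a sketch with illustrative names and sign conventions; the kernel itself reads the trained coefficients from the IHOG object):

    #include <cstddef>

    bool window_matches(const float *descriptor, const float *svm_weights, size_t len,
                        float bias, float threshold)
    {
        float score = bias;
        for(size_t i = 0; i < len; ++i)
        {
            score += descriptor[i] * svm_weights[i]; // distance from the classifying plane
        }
        return score > threshold; // windows above the threshold are recorded
    }
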
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ +#define __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ + +#include "arm_compute/core/CPP/kernels/CPPCornerCandidatesKernel.h" +#include "arm_compute/core/CPP/kernels/CPPSortEuclideanDistanceKernel.h" +#include "arm_compute/core/IArray.h" +#include "arm_compute/core/NEON/INEKernel.h" + +#include <cstdint> +#include <mutex> + +namespace arm_compute +{ +class ITensor; +using IImage = ITensor; + +/** Common interface for all Harris Score kernels */ +class INEHarrisScoreKernel : public INEKernel +{ +public: + /** Default constructor */ + INEHarrisScoreKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INEHarrisScoreKernel(const INEHarrisScoreKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INEHarrisScoreKernel &operator=(const INEHarrisScoreKernel &) = delete; + /** Allow instances of this class to be moved */ + INEHarrisScoreKernel(INEHarrisScoreKernel &&) = default; + /** Allow instances of this class to be moved */ + INEHarrisScoreKernel &operator=(INEHarrisScoreKernel &&) = default; + /** Default destructor */ + ~INEHarrisScoreKernel() = default; + +public: + /** Setup the kernel parameters + * + * @param[in] input1 Source image (gradient X). Data types supported: S16/S32 + * @param[in] input2 Source image (gradient Y). Data types supported: same as @p input1 + * @param[out] output Destination image (Harris score). Data types supported: F32 + * @param[in] norm_factor Normalization factor to use accordingly with the gradient size (Must be different from 0) + * @param[in] strength_thresh Minimum threshold with which to eliminate Harris Corner scores (computed using the normalized Sobel kernel). + * @param[in] sensitivity Sensitivity threshold k from the Harris-Stephens equation + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant.
+ */ + virtual void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) = 0; + +protected: + const IImage *_input1; /**< Source image - Gx component */ + const IImage *_input2; /**< Source image - Gy component */ + IImage *_output; /**< Destination image - Harris score */ + float _sensitivity; /**< Sensitivity value */ + float _strength_thresh; /**< Threshold value */ + float _norm_factor; /**< Normalization factor */ + BorderSize _border_size; /**< Border size */ +}; + +/** Template NEON kernel to perform Harris Score. + * The implementation supports 3, 5, and 7 for the block_size + */ +template <int32_t block_size> +class NEHarrisScoreKernel : public INEHarrisScoreKernel +{ +public: + /** Default constructor */ + NEHarrisScoreKernel(); + // Inherited methods overridden: + void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override; + BorderSize border_size() const override; + void run(const Window &window) override; + +private: + /** Common signature for all the specialised harris score functions */ + using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float norm_factor, float sensitivity, float strength_thresh); + /** Harris Score function to use for the particular image types passed to configure() */ + HarrisScoreFunction *_func; +}; + +#ifdef ARM_COMPUTE_ENABLE_FP16 +/** Interface for the Harris Score kernel using F16 */ +template <int32_t block_size> +class NEHarrisScoreFP16Kernel : public INEHarrisScoreKernel +{ +public: + /** Default constructor */ + NEHarrisScoreFP16Kernel(); + // Inherited methods overridden: + void configure(const IImage *input1, const IImage *input2, IImage *output, float norm_factor, float strength_thresh, float sensitivity, bool border_undefined) override; + BorderSize border_size() const override; + void run(const Window &window) override; + +private: + using HarrisScoreFunction = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int32_t input_stride, + float norm_factor, float sensitivity, float strength_thresh); + /** Harris Score function to use for the particular image types passed to configure() */ + HarrisScoreFunction *_func; +}; +#else +template <int32_t block_size> +using NEHarrisScoreFP16Kernel = NEHarrisScoreKernel<block_size>; +#endif +} +#endif /* __ARM_COMPUTE_NEHARRISCORNERSKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEHistogramKernel.h b/arm_compute/core/NEON/kernels/NEHistogramKernel.h new file mode 100644 index 0000000000..c4dbbeae83 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEHistogramKernel.h @@ -0,0 +1,129 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
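
For reference, the Harris-Stephens response these kernels evaluate per pixel is score = det(M) - k * trace(M)^2, where M is the 2x2 matrix of block-summed gradient products and k is the sensitivity parameter. A scalar sketch (illustrative; gx2, gy2 and gxgy stand for the block sums of Gx*Gx, Gy*Gy and Gx*Gy, already scaled by norm_factor):

    float harris_response(float gx2, float gy2, float gxgy, float sensitivity, float strength_thresh)
    {
        const float det   = gx2 * gy2 - gxgy * gxgy; // det(M)
        const float trace = gx2 + gy2;               // trace(M)
        const float score = det - sensitivity * trace * trace;
        return (score > strength_thresh) ? score : 0.0f; // scores below the strength threshold are dropped
    }
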
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+#define __ARM_COMPUTE_NEHISTOGRAMKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class IDistribution1D;
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the histogram kernel */
+class NEHistogramKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEHistogramKernel();
+    /** Default destructor */
+    ~NEHistogramKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHistogramKernel(const NEHistogramKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEHistogramKernel &operator=(const NEHistogramKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEHistogramKernel(NEHistogramKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEHistogramKernel &operator=(NEHistogramKernel &&) = default;
+
+    /** Set the input image and the distribution output.
+     *
+     * @param[in]     input      Source image. Data type supported: U8.
+     * @param[out]    output     Destination distribution.
+     * @param[in,out] local_hist Array that the threads use to save their local histograms.
+     *                           Its size should be equal to (number_of_threads * num_bins),
+     *                           and the Window::thread_id() is used to determine the part of the array
+     *                           used by each thread.
+     * @param[out]    window_lut LUT with pre-calculated possible window values.
+     *                           The size of the LUT should be equal to max_range_size. It is filled
+     *                           during the configure stage and re-used in every run, therefore it can
+     *                           be safely shared among threads.
+     */
+    void configure(const IImage *input, IDistribution1D *output, uint32_t *local_hist, uint32_t *window_lut);
+    /** Set the input image and the distribution output.
+     *
+     * @note Used for a histogram of fixed size equal to 256
+     *
+     * @param[in]  input  Source image. Data type supported: U8.
+     * @param[out] output Destination distribution, which must have 256 bins.
+     */
+    void configure(const IImage *input, IDistribution1D *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Function to merge multiple partial histograms.
+     *
+     * @param[out] global_hist Pointer to the final histogram.
+     * @param[in]  local_hist  Pointer to the partial histograms.
+     * @param[in]  bins        Number of bins.
+     */
+    void merge_histogram(uint32_t *global_hist, const uint32_t *local_hist, size_t bins);
+    /** Function to merge multiple minimum values of partial histograms.
+     *
+     * @param[out] global_min Pointer to the global min value.
+     * @param[in]  local_min  Local min value.
+     */
+    void merge_min(uint8_t *global_min, const uint8_t &local_min);
+    /** Function to perform histogram on the given window
+     *
+     * @param[in] win Region on which to execute the kernel
+     */
+    void histogram_U8(Window win);
+    /** Function to perform histogram on the given window where the histogram is
+     *  of fixed size 256 without ranges and offsets.
+     *
+     * @param[in] win Region on which to execute the kernel
+     */
+    void histogram_fixed_U8(Window win);
+    /** Pre-calculate the pixel windowing for every possible pixel
+     *
+     * Calculate (V - offset) * numBins / range where V is every possible pixel value.
+     *
+     * @note We currently support U8 images, thus possible pixel values are between 0 and 255
+     */
+    void calculate_window_lut() const;
+    /** Common signature for all the specialised Histogram functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using HistogramFunctionPtr = void (NEHistogramKernel::*)(Window window);
+
+    HistogramFunctionPtr _func; ///< Histogram function to use for the particular image types passed to configure()
+    const IImage    *_input;
+    IDistribution1D *_output;
+    uint32_t        *_local_hist;
+    uint32_t        *_window_lut;
+    std::mutex       _hist_mtx;
+    static constexpr unsigned int _max_range_size{ 256 }; ///< 256 possible pixel values as we handle only U8 images
+};
+}
+#endif /*__ARM_COMPUTE_NEHISTOGRAMKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIm2ColKernel.h b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
new file mode 100644
index 0000000000..ebaafb467f
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIm2ColKernel.h
@@ -0,0 +1,114 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEIM2COLKERNEL_H__
+#define __ARM_COMPUTE_NEIM2COLKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the im2col reshape kernel.
+ *
+ * Rearranges image blocks into columns: each convolution block is stripped out into a single column,
+ * which transforms a convolution into a plain matrix multiplication.
+ *
+ * For example, taking the image below and assuming 3x3 image blocks with a stride of 1, we have:
+ *
+ * @f[
+ * \left( \begin{array}{cccc}
+ * a00 & a01 & a02 & a03 \\
+ * a10 & a11 & a12 & a13 \\
+ * a20 & a21 & a22 & a23 \\
+ * a30 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{ccccccccc}
+ * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\
+ * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\
+ * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\
+ * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\
+ * \end{array} \right)
+ * @f]
+ */
+class NEIm2ColKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEIm2ColKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIm2ColKernel(const NEIm2ColKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEIm2ColKernel(NEIm2ColKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
+    /** Default destructor */
+    ~NEIm2ColKernel() = default;
+
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  input          The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     *                            while every optional dimension from 4 and above represents a batch of inputs. Data types supported: QS8/F32
+     * @param[out] output         The output tensor. Data types supported: Same as @p input
+     * @param[in]  convolved_dims The convolved output dimensions.
+     * @param[in]  conv_info      Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  has_bias       In case biases are provided, expands the matrix with 1.
+     */
+    void configure(const ITensor *input, ITensor *output, std::pair<unsigned int, unsigned int> convolved_dims, const PadStrideInfo &conv_info, bool has_bias);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Template function to run the im2col optimised for the fully connected layer case
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_reduced(const Window &window);
+    /** Template function to run the im2col used for the convolution layer case
+     *
+     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     */
+    template <typename T>
+    void run_generic(const Window &window);
+    /** Common signature for all the specialised im2col functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
+
+    Im2ColFunctionPtr _func;
+    const ITensor *_input;
+    ITensor       *_output;
+    std::pair<unsigned int, unsigned int> _convolved_dims;
+    PadStrideInfo _conv_info;
+    unsigned int  _kernel_size;
+    bool          _has_bias;
+};
+}
+#endif /*__ARM_COMPUTE_NEIM2COLKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
new file mode 100644
index 0000000000..13647889ab
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEIntegralImageKernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
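
[Editor's note] The block-to-row rearrangement documented above is easy to prototype in plain C++. The sketch below is purely illustrative (it is not the library's implementation; it assumes a single-channel float image, 3x3 blocks, stride 1, no padding and no bias column):

    #include <vector>

    // Illustrative im2col: each kh x kw block of a w x h single-channel image
    // becomes one row of the output matrix, matching the 4x4 example above.
    std::vector<std::vector<float>> im2col(const std::vector<float> &img, int w, int h, int kw, int kh)
    {
        std::vector<std::vector<float>> rows;
        for(int y = 0; y <= h - kh; ++y)
        {
            for(int x = 0; x <= w - kw; ++x)
            {
                std::vector<float> row;
                for(int j = 0; j < kh; ++j)
                {
                    for(int i = 0; i < kw; ++i)
                    {
                        row.push_back(img[(y + j) * w + (x + i)]);
                    }
                }
                rows.push_back(row); // e.g. a00 a01 a02 a10 a11 a12 a20 a21 a22
            }
        }
        return rows;
    }
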
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+#define __ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__
+
+#include "arm_compute/core/NEON/INESimpleKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Kernel to compute the integral image of an image */
+class NEIntegralImageKernel : public INESimpleKernel
+{
+public:
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input  Source tensor. Data type supported: U8
+     * @param[out] output Destination tensor. Data type supported: U32
+     */
+    void configure(const ITensor *input, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+    bool is_parallelisable() const override;
+};
+}
+#endif /*__ARM_COMPUTE_NEINTEGRALIMAGEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELKTrackerKernel.h b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
new file mode 100644
index 0000000000..9ab7f91092
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELKTrackerKernel.h
@@ -0,0 +1,144 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
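
[Editor's note] For reference, an integral image stores at each position the sum of all pixels above and to the left of it, which a single-pass recurrence captures. A scalar sketch of the idea for a U8 input and U32 output (not the kernel's NEON code):

    #include <cstdint>
    #include <vector>

    // ii(x, y) = in(x, y) + ii(x - 1, y) + ii(x, y - 1) - ii(x - 1, y - 1)
    std::vector<uint32_t> integral_image(const std::vector<uint8_t> &in, int w, int h)
    {
        std::vector<uint32_t> ii(w * h, 0);
        for(int y = 0; y < h; ++y)
        {
            for(int x = 0; x < w; ++x)
            {
                const uint32_t left    = (x > 0) ? ii[y * w + x - 1] : 0;
                const uint32_t up      = (y > 0) ? ii[(y - 1) * w + x] : 0;
                const uint32_t up_left = (x > 0 && y > 0) ? ii[(y - 1) * w + x - 1] : 0;
                ii[y * w + x] = in[y * w + x] + left + up - up_left;
            }
        }
        return ii;
    }
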
+ */
+#ifndef __ARM_COMPUTE_NELKTRACKERKERNEL_H__
+#define __ARM_COMPUTE_NELKTRACKERKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstddef>
+#include <cstdint>
+#include <tuple>
+#include <utility>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Internal keypoint class for Lucas-Kanade Optical Flow */
+struct NELKInternalKeypoint
+{
+    float x{ 0.f };                 /**< x coordinate of the keypoint */
+    float y{ 0.f };                 /**< y coordinate of the keypoint */
+    bool  tracking_status{ false }; /**< the tracking status of the keypoint */
+};
+
+using INELKInternalKeypointArray = IArray<NELKInternalKeypoint>;
+
+/** Interface for the Lucas-Kanade tracker kernel */
+class NELKTrackerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELKTrackerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELKTrackerKernel(const NELKTrackerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELKTrackerKernel &operator=(const NELKTrackerKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELKTrackerKernel(NELKTrackerKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELKTrackerKernel &operator=(NELKTrackerKernel &&) = default;
+    /** Default destructor */
+    ~NELKTrackerKernel() = default;
+
+    /** Initialise the kernel input and output
+     *
+     * @param[in]      input_old            Pointer to the input old tensor. Data type supported: U8
+     * @param[in]      input_new            Pointer to the input new tensor. Data type supported: U8
+     * @param[in]      old_scharr_gx        Pointer to the input Scharr X tensor. Data type supported: S16
+     * @param[in]      old_scharr_gy        Pointer to the input Scharr Y tensor. Data type supported: S16
+     * @param[in]      old_points           Pointer to the IKeyPointArray storing old key points
+     * @param[in]      new_points_estimates Pointer to the IKeyPointArray storing the new estimated key points
+     * @param[out]     new_points           Pointer to the IKeyPointArray storing new key points
+     * @param[in, out] old_points_internal  Pointer to the array of NELKInternalKeypoint for old points
+     * @param[out]     new_points_internal  Pointer to the array of NELKInternalKeypoint for new points
+     * @param[in]      termination          The criteria to terminate the search of each keypoint.
+     * @param[in]      use_initial_estimate The flag to indicate whether the initial estimated position should be used
+     * @param[in]      epsilon              The error for terminating the algorithm
+     * @param[in]      num_iterations       The maximum number of iterations before terminating the algorithm
+     * @param[in]      window_dimension     The size of the window on which to perform the algorithm
+     * @param[in]      level                The pyramid level
+     * @param[in]      num_levels           The number of pyramid levels
+     * @param[in]      pyramid_scale        Scale factor used for generating the pyramid
+     */
+    void configure(const ITensor *input_old, const ITensor *input_new, const ITensor *old_scharr_gx, const ITensor *old_scharr_gy,
+                   const IKeyPointArray *old_points, const IKeyPointArray *new_points_estimates, IKeyPointArray *new_points,
+                   INELKInternalKeypointArray *old_points_internal, INELKInternalKeypointArray *new_points_internal,
+                   Termination termination, bool use_initial_estimate, float epsilon, unsigned int num_iterations, size_t window_dimension,
+                   size_t level, size_t num_levels, float pyramid_scale);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Initialise the array of keypoints in the provided range
+     *
+     * @param[in] start Index of the first element in the keypoints array to be initialised
+     * @param[in] end   Index after the last element in the keypoints array to be initialised
+     */
+    void init_keypoints(int start, int end);
+    /** Compute the structure tensor A^T * A based on the Scharr gradients I_x and I_y
+     *
+     * @param[in]  keypoint    Keypoint for which gradients are computed
+     * @param[out] bilinear_ix Intermediate interpolated data for X gradient
+     * @param[out] bilinear_iy Intermediate interpolated data for Y gradient
+     *
+     * @return Values A11, A12, A22
+     */
+    std::tuple<float, float, float> compute_spatial_gradient_matrix(const NELKInternalKeypoint &keypoint, int *bilinear_ix, int *bilinear_iy);
+    /** Compute the vector A^T * b, i.e. -sum(I_d * I_t) for d in {x,y}
+     *
+     * @param[in] old_keypoint Old keypoint for which the gradient is computed
+     * @param[in] new_keypoint New keypoint for which the gradient is computed
+     * @param[in] bilinear_ix  Intermediate interpolated data for X gradient
+     * @param[in] bilinear_iy  Intermediate interpolated data for Y gradient
+     *
+     * @return Values b1, b2
+     */
+    std::pair<float, float> compute_image_mismatch_vector(const NELKInternalKeypoint &old_keypoint, const NELKInternalKeypoint &new_keypoint, const int *bilinear_ix, const int *bilinear_iy);
+
+    const ITensor *_input_old;
+    const ITensor *_input_new;
+    const ITensor *_old_scharr_gx;
+    const ITensor *_old_scharr_gy;
+    IKeyPointArray       *_new_points;
+    const IKeyPointArray *_new_points_estimates;
+    const IKeyPointArray *_old_points;
+    INELKInternalKeypointArray *_old_points_internal;
+    INELKInternalKeypointArray *_new_points_internal;
+    Termination  _termination;
+    bool         _use_initial_estimate;
+    float        _pyramid_scale;
+    float        _epsilon;
+    unsigned int _num_iterations;
+    int          _window_dimension;
+    unsigned int _level;
+    unsigned int _num_levels;
+    ValidRegion  _valid_region;
+};
+}
+#endif /*__ARM_COMPUTE_NELKTRACKERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
new file mode 100644
index 0000000000..d4bff661f9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NELocallyConnectedMatrixMultiplyKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
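
[Editor's note] The two helpers documented above correspond to one Gauss-Newton step of Lucas-Kanade: with A^T*A = [[A11, A12], [A12, A22]] and A^T*b = (b1, b2), the displacement is d = (A^T*A)^-1 * (A^T*b). A scalar sketch of that 2x2 solve, with hypothetical names, not the kernel code:

    #include <cmath>
    #include <utility>

    // Solve the 2x2 system (A^T A) d = A^T b for the LK displacement d = (dx, dy).
    // Returns (0, 0) when the matrix is near singular (untrackable point).
    std::pair<float, float> lk_step(float a11, float a12, float a22, float b1, float b2)
    {
        const float det = a11 * a22 - a12 * a12;
        if(std::fabs(det) < 1e-7f)
        {
            return { 0.f, 0.f };
        }
        const float dx = (a22 * b1 - a12 * b2) / det;
        const float dy = (a11 * b2 - a12 * b1) / det;
        return { dx, dy };
    }

The tracker iterates this step up to num_iterations times, or until the displacement norm drops below epsilon, at each pyramid level.
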
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+#define __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** NEON kernel to multiply each row of the first tensor with the low 2 dimensions of the second tensor. */
+class NELocallyConnectedMatrixMultiplyKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NELocallyConnectedMatrixMultiplyKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(const NELocallyConnectedMatrixMultiplyKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NELocallyConnectedMatrixMultiplyKernel &operator=(NELocallyConnectedMatrixMultiplyKernel &&) = default;
+    /** Initialise the kernel's input and output
+     *
+     * @param[in]  input0 First input tensor. Data types supported: F32
+     * @param[in]  input1 Second input tensor containing the Matrix B. Data type supported: same as @p input0
+     * @param[out] output Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0
+     */
+    void configure(const ITensor *input0, const ITensor *input1, ITensor *output);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const ITensor *_input0;
+    const ITensor *_input1;
+    ITensor       *_output;
+};
+}
+#endif /* __ARM_COMPUTE_NELOCALLYCONNECTEDMATRIXMULTIPLYKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
new file mode 100644
index 0000000000..5d49901dd0
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMagnitudePhaseKernel.h
@@ -0,0 +1,164 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+#define __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMagnitudePhaseKernel();
+    /** Destructor */
+    ~NEMagnitudePhaseKernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseKernel(const NEMagnitudePhaseKernel &) = delete;
+    /** Default move constructor */
+    NEMagnitudePhaseKernel(NEMagnitudePhaseKernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseKernel &operator=(const NEMagnitudePhaseKernel &) = delete;
+    /** Default move assignment operator */
+    NEMagnitudePhaseKernel &operator=(NEMagnitudePhaseKernel &&) = default;
+
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @note At least one of magnitude or phase must be set
+     *
+     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
+     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
+     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
+     */
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Function to perform magnitude on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void magnitude(const Window &window);
+    /** Function to perform phase on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void phase(const Window &window);
+    /** Function to perform magnitude and phase on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void magnitude_phase(const Window &window);
+
+private:
+    /** Common signature for all the specialised MagnitudePhase functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseKernel::*)(const Window &window);
+    /** MagnitudePhase function to use for the particular formats passed to configure() */
+    MagnitudePhaseFunctionPtr _func;
+    const ITensor *_gx;        /**< Input gradient X */
+    const ITensor *_gy;        /**< Input gradient Y */
+    ITensor       *_magnitude; /**< Output - Magnitude */
+    ITensor       *_phase;     /**< Output - Phase */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** Template interface for the kernel to compute magnitude and phase */
+template <MagnitudeType mag_type, PhaseType phase_type>
+class NEMagnitudePhaseFP16Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMagnitudePhaseFP16Kernel();
+    /** Destructor */
+    ~NEMagnitudePhaseFP16Kernel() = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseFP16Kernel(const NEMagnitudePhaseFP16Kernel &) = delete;
+    /** Default move constructor */
+    NEMagnitudePhaseFP16Kernel(NEMagnitudePhaseFP16Kernel &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMagnitudePhaseFP16Kernel &operator=(const NEMagnitudePhaseFP16Kernel &) = delete;
+    /** Default move assignment operator */
+    NEMagnitudePhaseFP16Kernel &operator=(NEMagnitudePhaseFP16Kernel &&) = default;
+
+    /** Initialise the kernel's inputs and outputs.
+     *
+     * @note At least one of magnitude or phase must be set
+     *
+     * @param[in]  gx        Gradient X tensor. Data type supported: S16.
+     * @param[in]  gy        Gradient Y tensor. Data type supported: S16.
+     * @param[out] magnitude (Optional) The output tensor - Magnitude. Data type supported: S16.
+     * @param[out] phase     (Optional) The output tensor - Phase. Data type supported: U8.
+     */
+    void configure(const ITensor *gx, const ITensor *gy, ITensor *magnitude, ITensor *phase);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Function to perform magnitude on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void magnitude(const Window &window);
+    /** Function to perform phase on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void phase(const Window &window);
+    /** Function to perform magnitude and phase on the given window
+     *
+     * @param[in] window Region on which to execute the kernel
+     */
+    void magnitude_phase(const Window &window);
+
+    /** Common signature for all the specialised MagnitudePhase functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MagnitudePhaseFunctionPtr = void (NEMagnitudePhaseFP16Kernel::*)(const Window &window);
+    /** MagnitudePhase function to use for the particular formats passed to configure() */
+    MagnitudePhaseFunctionPtr _func;
+    const ITensor *_gx;        /**< Input gradient X */
+    const ITensor *_gy;        /**< Input gradient Y */
+    ITensor       *_magnitude; /**< Output - Magnitude */
+    ITensor       *_phase;     /**< Output - Phase */
+};
+#else
+template <MagnitudeType mag_type, PhaseType phase_type>
+using NEMagnitudePhaseFP16Kernel = NEMagnitudePhaseKernel<mag_type, phase_type>;
+#endif
+}
+#endif /* __ARM_COMPUTE_NEMAGNITUDEPHASEKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
new file mode 100644
index 0000000000..83407ccb7d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMeanStdDevKernel.h
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
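
[Editor's note] For orientation, the per-pixel math behind both magnitude/phase variants is small. A scalar sketch for S16 gradients with L2 magnitude and an unsigned phase mapped onto U8 (illustrative only, not the NEON implementation):

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // L2 magnitude of the two gradient components, saturated to S16.
    int16_t magnitude_l2(int16_t gx, int16_t gy)
    {
        const float m = std::sqrt(float(gx) * gx + float(gy) * gy);
        return static_cast<int16_t>(std::min(m, 32767.f));
    }

    // Gradient direction mapped from [0, 360) degrees onto the U8 range.
    uint8_t phase_unsigned(int16_t gx, int16_t gy)
    {
        float angle = std::atan2(float(gy), float(gx)) * 180.f / 3.14159265f;
        if(angle < 0.f)
        {
            angle += 360.f;
        }
        return static_cast<uint8_t>(angle * 256.f / 360.f);
    }
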
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+#define __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to calculate mean and standard deviation of input image pixels. */
+class NEMeanStdDevKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMeanStdDevKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevKernel(const NEMeanStdDevKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMeanStdDevKernel &operator=(const NEMeanStdDevKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMeanStdDevKernel(NEMeanStdDevKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMeanStdDevKernel &operator=(NEMeanStdDevKernel &&) = default;
+    /** Default destructor */
+    ~NEMeanStdDevKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input              Input image. Data type supported: U8.
+     * @param[out] mean               Output average pixel value.
+     * @param[out] global_sum         Keeps global sum of pixel values.
+     * @param[out] stddev             (Optional) Output standard deviation of pixel values.
+     * @param[out] global_sum_squared (Optional if stddev is not set, required if stddev is set) Keeps global sum of squared pixel values.
+     */
+    void configure(const IImage *input, float *mean, uint64_t *global_sum, float *stddev = nullptr, uint64_t *global_sum_squared = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    const IImage *_input;
+    float    *_mean;
+    float    *_stddev;
+    uint64_t *_global_sum;
+    uint64_t *_global_sum_squared;
+    std::mutex _mtx;
+};
+}
+#endif /* __ARM_COMPUTE_NEMEANSTDDEVKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
new file mode 100644
index 0000000000..dee1aadfb9
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEMedian3x3Kernel.h
@@ -0,0 +1,50 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
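
[Editor's note] The two global accumulators documented above are enough to finalise both statistics once every window has run. A sketch of that reduction, using a hypothetical helper and the usual sum / sum-of-squares identities:

    #include <cmath>
    #include <cstdint>

    // Final reduction: mean = S / N, stddev = sqrt(S2 / N - mean^2),
    // where S and S2 are the global sum and global sum of squares.
    void finalise_mean_stddev(uint64_t global_sum, uint64_t global_sum_squared,
                              uint64_t num_pixels, float *mean, float *stddev)
    {
        *mean = static_cast<float>(global_sum) / num_pixels;
        if(stddev != nullptr)
        {
            const float mean_sq = (*mean) * (*mean);
            *stddev = std::sqrt(static_cast<float>(global_sum_squared) / num_pixels - mean_sq);
        }
    }
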
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ +#define __ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Kernel to perform a median filter on a tensor */ +class NEMedian3x3Kernel : public INESimpleKernel +{ +public: + /** Set the source, destination and border mode of the kernel + * + * @param[in] input Source tensor. Data type supported: U8 + * @param[out] output Destination tensor. Data type supported: U8 + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; +}; +} +#endif /*__ARM_COMPUTE_NEMEDIAN3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h new file mode 100644 index 0000000000..e405ea5ae4 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEMinMaxLocationKernel.h @@ -0,0 +1,161 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
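
[Editor's note] A 3x3 median filter replaces each pixel with the middle value of its neighbourhood. A plain scalar version (not the NEON kernel) can lean on std::nth_element:

    #include <algorithm>
    #include <array>
    #include <cstdint>

    // Median of the 3x3 neighbourhood centred on (x, y); the caller must
    // guarantee that (x, y) is at least one pixel away from every border.
    uint8_t median3x3(const uint8_t *img, int stride, int x, int y)
    {
        std::array<uint8_t, 9> v;
        int k = 0;
        for(int j = -1; j <= 1; ++j)
        {
            for(int i = -1; i <= 1; ++i)
            {
                v[k++] = img[(y + j) * stride + (x + i)];
            }
        }
        std::nth_element(v.begin(), v.begin() + 4, v.end());
        return v[4];
    }
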
+ */
+#ifndef __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+#define __ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__
+
+#include "arm_compute/core/IArray.h"
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+#include <mutex>
+
+namespace arm_compute
+{
+class ITensor;
+using IImage = ITensor;
+
+/** Interface for the kernel to perform min max search on an image. */
+class NEMinMaxKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMinMaxKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxKernel(const NEMinMaxKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxKernel &operator=(const NEMinMaxKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMinMaxKernel(NEMinMaxKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMinMaxKernel &operator=(NEMinMaxKernel &&) = default;
+    /** Default destructor */
+    ~NEMinMaxKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input Input Image. Data types supported: U8/S16.
+     * @param[out] min   Minimum value of image.
+     * @param[out] max   Maximum value of image.
+     */
+    void configure(const IImage *input, int32_t *min, int32_t *max);
+    /** Resets global minimum and maximum. */
+    void reset();
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+
+private:
+    /** Performs the min/max algorithm on U8 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_U8(const Window &win);
+    /** Performs the min/max algorithm on S16 images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    void minmax_S16(const Window &win);
+    /** Common signature for all the specialised MinMax functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MinMaxFunction = void (NEMinMaxKernel::*)(const Window &window);
+    /** MinMax function to use for the particular image types passed to configure() */
+    MinMaxFunction _func;
+    /** Helper to update min/max values */
+    template <typename T>
+    void update_min_max(T min, T max);
+
+    const IImage *_input;    /**< Input image. */
+    int32_t *_min;           /**< Minimum value. */
+    int32_t *_max;           /**< Maximum value. */
+    int32_t  _min_init;      /**< Value to initialise global minimum value. */
+    int32_t  _max_init;      /**< Value to initialise global maximum value. */
+    std::mutex _mtx;         /**< Mutex used for result reduction. */
+};
+
+/** Interface for the kernel to find min max locations of an image. */
+class NEMinMaxLocationKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NEMinMaxLocationKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocationKernel(const NEMinMaxLocationKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NEMinMaxLocationKernel &operator=(const NEMinMaxLocationKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NEMinMaxLocationKernel(NEMinMaxLocationKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NEMinMaxLocationKernel &operator=(NEMinMaxLocationKernel &&) = default;
+    /** Default destructor */
+    ~NEMinMaxLocationKernel() = default;
+
+    /** Initialise the kernel's input and outputs.
+     *
+     * @param[in]  input     Input Image. Data types supported: U8/S16.
+     * @param[out] min       Minimum value of image.
+     * @param[out] max       Maximum value of image.
+     * @param[out] min_loc   Array of minimum value locations.
+     * @param[out] max_loc   Array of maximum value locations.
+     * @param[out] min_count Number of minimum value encounters.
+     * @param[out] max_count Number of maximum value encounters.
+     */
+    void configure(const IImage *input, int32_t *min, int32_t *max,
+                   ICoordinates2DArray *min_loc = nullptr, ICoordinates2DArray *max_loc = nullptr,
+                   uint32_t *min_count = nullptr, uint32_t *max_count = nullptr);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    bool is_parallelisable() const override;
+
+private:
+    /** Performs the min/max location algorithm on T type images on a given window.
+     *
+     * @param win The window to run the algorithm on.
+     */
+    template <class T, bool count_min, bool count_max, bool loc_min, bool loc_max>
+    void minmax_loc(const Window &win);
+    /** Common signature for all the specialised MinMaxLoc functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using MinMaxLocFunction = void (NEMinMaxLocationKernel::*)(const Window &window);
+    /** MinMaxLoc function to use for the particular image types passed to configure() */
+    MinMaxLocFunction _func;
+    /** Helper to create a function pointer table for the parameterized MinMaxLocation functions. */
+    template <class T, typename>
+    struct create_func_table;
+
+    const IImage *_input;                             /**< Input image. */
+    int32_t  *_min;                                   /**< Minimum value. */
+    int32_t  *_max;                                   /**< Maximum value. */
+    uint32_t *_min_count;                             /**< Count of minimum value encounters. */
+    uint32_t *_max_count;                             /**< Count of maximum value encounters. */
+    ICoordinates2DArray *_min_loc;                    /**< Locations of minimum values. */
+    ICoordinates2DArray *_max_loc;                    /**< Locations of maximum values. */
+    unsigned int _num_elems_processed_per_iteration; /**< Elements processed per iteration. */
+};
+}
+#endif /*__ARM_COMPUTE_NEMINMAXLOCATIONKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
new file mode 100644
index 0000000000..ede0294a73
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonLinearFilterKernel.h
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
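
[Editor's note] The mutex member above exists because each thread reduces its own window first and only then merges into the shared result. The merge step amounts to the sketch below (hypothetical names, not the kernel code):

    #include <algorithm>
    #include <cstdint>
    #include <mutex>

    // Merge a thread-local (min, max) pair into the shared global result.
    // The lock only guards this tiny critical section, not the per-window scan.
    void merge_min_max(int32_t local_min, int32_t local_max,
                       int32_t *global_min, int32_t *global_max, std::mutex &mtx)
    {
        std::lock_guard<std::mutex> lock(mtx);
        *global_min = std::min(*global_min, local_min);
        *global_max = std::max(*global_max, local_max);
    }
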
+ */
+#ifndef __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+#define __ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+#include "arm_compute/core/Types.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the kernel to apply a non-linear filter */
+class NENonLinearFilterKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NENonLinearFilterKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonLinearFilterKernel(NENonLinearFilterKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NENonLinearFilterKernel(NENonLinearFilterKernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NENonLinearFilterKernel &operator=(NENonLinearFilterKernel &&) = default;
+    /** Set the source, destination and border mode of the kernel
+     *
+     * @param[in]  input            Source tensor. Data type supported: U8
+     * @param[out] output           Destination tensor. Data type supported: U8
+     * @param[in]  function         Non-linear function to perform
+     * @param[in]  mask_size        Mask size. Supported sizes: 3, 5
+     * @param[in]  pattern          Mask pattern
+     * @param[in]  mask             The given mask. Will be used only if the pattern is PATTERN_OTHER
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, NonLinearFilterFunction function, unsigned int mask_size, MatrixPattern pattern, const uint8_t *mask, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Fill mask with the corresponding given pattern.
+     *
+     * @param[in,out] mask    Mask to be filled according to pattern
+     * @param[in]     cols    Columns (width) of mask
+     * @param[in]     rows    Rows (height) of mask
+     * @param[in]     pattern Pattern to fill the mask according to
+     */
+    void fill_mask(uint8_t *mask, int cols, int rows, MatrixPattern pattern);
+    /** Apply a median filter when the given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_box(const Window &win);
+    /** Apply a min filter when the given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_box(const Window &win);
+    /** Apply a max filter when the given mask pattern is defined as box.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_box(const Window &win);
+    /** Apply a median filter when the given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_cross(const Window &win);
+    /** Apply a min filter when the given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_cross(const Window &win);
+    /** Apply a max filter when the given mask pattern is defined as cross.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_cross(const Window &win);
+    /** Apply a median filter when the given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void median_filter_disk(const Window &win);
+    /** Apply a min filter when the given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void min_filter_disk(const Window &win);
+    /** Apply a max filter when the given mask pattern is defined as disk.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void max_filter_disk(const Window &win);
+    /** Apply a non-linear filter when the given mask has a user-defined pattern.
+     *
+     * @param[in] win Window to apply the filter on.
+     */
+    template <int mask_w, int mask_h>
+    void non_linear_filter_generic(const Window &win);
+
+private:
+    unsigned int            _border_width;
+    const ITensor          *_input;
+    ITensor                *_output;
+    const uint8_t          *_mask;
+    MatrixPattern           _pattern;
+    NonLinearFilterFunction _function;
+    unsigned int            _func_idx;
+    BorderSize              _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENONLINEARFILTERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
new file mode 100644
index 0000000000..0daae59e54
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENonMaximaSuppression3x3Kernel.h
@@ -0,0 +1,99 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
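
[Editor's note] To make the mask patterns above concrete: filling a cols x rows CROSS mask selects only the centre row and centre column. An illustrative fill, under the assumption that 255 means "pixel participates" and 0 means "ignored":

    #include <cstdint>

    // Fill a cols x rows mask with a cross pattern: only elements on the
    // centre row or centre column are selected (255), the rest are 0.
    void fill_mask_cross(uint8_t *mask, int cols, int rows)
    {
        for(int r = 0; r < rows; ++r)
        {
            for(int c = 0; c < cols; ++c)
            {
                const bool on_cross = (r == rows / 2) || (c == cols / 2);
                mask[r * cols + c] = on_cross ? 255 : 0;
            }
        }
    }
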
+ */
+#ifndef __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+#define __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+#include <cstdint>
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface to perform Non-Maxima suppression over a 3x3 window using NEON
+ *
+ * @note Used by @ref NEFastCorners and @ref NEHarrisCorners
+ */
+class NENonMaximaSuppression3x3Kernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NENonMaximaSuppression3x3Kernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonMaximaSuppression3x3Kernel(const NENonMaximaSuppression3x3Kernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENonMaximaSuppression3x3Kernel &operator=(const NENonMaximaSuppression3x3Kernel &) = delete;
+    /** Allow instances of this class to be moved */
+    NENonMaximaSuppression3x3Kernel(NENonMaximaSuppression3x3Kernel &&) = default;
+    /** Allow instances of this class to be moved */
+    NENonMaximaSuppression3x3Kernel &operator=(NENonMaximaSuppression3x3Kernel &&) = default;
+    /** Default destructor */
+    ~NENonMaximaSuppression3x3Kernel() = default;
+
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/F32
+     * @param[out] output           Destination tensor. Data types supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+protected:
+    /** Common signature for all the specialised non-maxima suppression 3x3 functions
+     *
+     * @param[in]  input_ptr    Pointer to the input tensor.
+     * @param[out] output_ptr   Pointer to the output tensor
+     * @param[in]  input_stride Stride of the input tensor
+     */
+    using NonMaxSuppr3x3Function = void(const void *__restrict input_ptr, void *__restrict output_ptr, const uint32_t input_stride);
+
+    NonMaxSuppr3x3Function *_func; /**< Non-Maxima suppression function to use for the particular tensor types passed to configure() */
+    const ITensor          *_input;  /**< Source tensor */
+    ITensor                *_output; /**< Destination tensor */
+};
+
+#ifdef ARM_COMPUTE_ENABLE_FP16
+/** NEON kernel to perform Non-Maxima suppression 3x3 with intermediate results in F16 if the input data type is F32
+ */
+class NENonMaximaSuppression3x3FP16Kernel : public NENonMaximaSuppression3x3Kernel
+{
+public:
+    /** Initialise the kernel's sources, destinations and border mode.
+     *
+     * @param[in]  input            Source tensor. Data types supported: U8/F32.
+     * @param[out] output           Destination tensor. Data types supported: same as @p input
+     * @param[in]  border_undefined True if the border mode is undefined. False if it's replicate or constant.
+     */
+    void configure(const ITensor *input, ITensor *output, bool border_undefined);
+};
+#else
+using NENonMaximaSuppression3x3FP16Kernel = NENonMaximaSuppression3x3Kernel;
+#endif
+}
+#endif /* __ARM_COMPUTE_NENONMAXIMASUPPRESSION3x3KERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
new file mode 100644
index 0000000000..d4e36d5ff1
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NENormalizationLayerKernel.h
@@ -0,0 +1,106 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+#define __ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__
+
+#include "arm_compute/core/NEON/INEKernel.h"
+
+namespace arm_compute
+{
+class ITensor;
+
+/** Interface for the normalization layer kernel.
+ */
+class NENormalizationLayerKernel : public INEKernel
+{
+public:
+    /** Default constructor */
+    NENormalizationLayerKernel();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayerKernel(const NENormalizationLayerKernel &) = delete;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    NENormalizationLayerKernel &operator=(const NENormalizationLayerKernel &) = delete;
+    /** Default Move Constructor. */
+    NENormalizationLayerKernel(NENormalizationLayerKernel &&) = default;
+    /** Default move assignment operator. */
+    NENormalizationLayerKernel &operator=(NENormalizationLayerKernel &&) = default;
+    /** Default destructor */
+    ~NENormalizationLayerKernel() = default;
+    /** Set the input and output tensors.
+     *
+     * @param[in]  input         Source tensor. 3 lower dims represent a single input with dimensions [width, height, IFM],
+     *                           and an optional 4th dimension for a batch of inputs. Data types supported: QS8/F32.
+     * @param[in]  input_squared Source tensor with each element squared. 3 lower dims represent a single input with dimensions [width, height, IFM].
+     *                           Data type supported: same as @p input
+     * @param[out] output        Destination tensor. Output will have the same number of dimensions as input. Data type supported: same as @p input
+     * @param[in]  norm_info     Normalization layer information like the normalization type, normalization size and other parameters.
+     */
+    void configure(const ITensor *input, const ITensor *input_squared, ITensor *output, NormalizationLayerInfo norm_info);
+
+    // Inherited methods overridden:
+    void run(const Window &window) override;
+    BorderSize border_size() const override;
+
+private:
+    /** Function to perform normalization depending on the given template
+     *  dimension. The second template parameter specifies whether the
+     *  normalization has to be 1D or 2D.
+     *
+     * @note Only supported normalizations are:
+     *       - 1D over X or Z
+     *       - 2D over X and Y
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <unsigned int dim, bool do_2D_norm>
+    void normalize(const Window &window);
+
+    /** Function to perform normalization for fixed-point values depending on
+     *  the given template dimension. The second template parameter specifies
+     *  whether the normalization has to be 1D or 2D.
+     *
+     * @note Only supported normalizations are:
+     *       - 1D over X or Z
+     *       - 2D over X and Y
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    template <unsigned int dim, bool do_2D_norm>
+    void normalize_fixed_point(const Window &window);
+    /** Common signature for all the specialised normalization functions
+     *
+     * @param[in] window Region on which to execute the kernel.
+     */
+    using NormalizationFunction = void (NENormalizationLayerKernel::*)(const Window &window);
+
+private:
+    NormalizationFunction _func;
+    const ITensor *_input;
+    const ITensor *_input_squared;
+    ITensor       *_output;
+    NormalizationLayerInfo _norm_info;
+    BorderSize    _border_size;
+};
+}
+#endif /*__ARM_COMPUTE_NENORMALIZATIONLAYERKERNEL_H__ */
diff --git a/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
new file mode 100644
index 0000000000..7e402cd220
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/NEPixelWiseMultiplicationKernel.h
@@ -0,0 +1,105 @@
+/*
+ * Copyright (c) 2016, 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
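
[Editor's note] As a reminder of the math this kernel implements, cross-map (1D over Z) local response normalization divides every element by a power of the local energy: out = in / (kappa + alpha/n * sum over the n neighbouring maps of in^2)^beta. A scalar sketch for one element, assuming NormalizationLayerInfo carries kappa, alpha, beta and norm_size as its parameters:

    #include <cmath>

    // Normalize one element given the sum of squares over its n-map neighbourhood.
    float normalize_element(float in, float sum_squares, float kappa, float alpha,
                            float beta, unsigned int norm_size)
    {
        const float coeff = alpha / norm_size; // per-element scaling of the energy term
        return in / std::pow(kappa + coeff * sum_squares, beta);
    }
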
+ */ +#ifndef __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__ +#define __ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to perform pixel-wise multiplication between two tensors */ +class NEPixelWiseMultiplicationKernel : public INEKernel +{ +public: + /** Default constructor */ + NEPixelWiseMultiplicationKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPixelWiseMultiplicationKernel(const NEPixelWiseMultiplicationKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPixelWiseMultiplicationKernel &operator=(const NEPixelWiseMultiplicationKernel &) = delete; + /** Allow instances of this class to be moved */ + NEPixelWiseMultiplicationKernel(NEPixelWiseMultiplicationKernel &&) = default; + /** Allow instances of this class to be moved */ + NEPixelWiseMultiplicationKernel &operator=(NEPixelWiseMultiplicationKernel &&) = default; + /** Default destructor */ + ~NEPixelWiseMultiplicationKernel() = default; + /** Initialise the kernel's input, output and border mode. + * + * @note For @p scale equal to 1/255 only round to nearest even (implemented as round half up) is supported. + * For all other scale values only round to zero (implemented as round towards minus infinity) is supported. + * + * @param[in] input1 An input tensor. Data types supported: U8/QS8/S16/F32. + * @param[in] input2 An input tensor. Data types supported: U8/QS8/S16/F32. + * @param[out] output The output tensor. Data types supported: U8 (Only if both inputs are U8) /S16/F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. + * @param[in] rounding_policy Rounding policy. + */ + void configure(const ITensor *input1, const ITensor *input2, ITensor *output, float scale, ConvertPolicy overflow_policy, RoundingPolicy rounding_policy); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the specialised multiplication functions with integer scaling factor + * + * @param[in] input1_ptr Pointer to the first input tensor. + * @param[in] input2_ptr Pointer to the second input tensor. + * @param[out] output_ptr Pointer to the output tensor. + * @param[in] scale Integer scaling factor. + */ + using MulFunctionInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale); + /** Common signature for all the specialised multiplication functions with fixed-point values + * + * @param[in] input1_ptr Pointer to the first input tensor. + * @param[in] input2_ptr Pointer to the second input tensor. + * @param[in] scale Scaling factor. + * @param[in] fixed_point_position Fixed-point position that expresses the number of bits for the fractional part of the number. + * @param[out] output_ptr Pointer to the output tensor. + */ + using MulFunctionQInt = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, int scale, int fixed_point_position); + /** Common signature for all the specialised multiplication functions with float scaling factor + * + * @param[in] input1_ptr Pointer to the first input tensor. + * @param[in] input2_ptr Pointer to the second input tensor. + * @param[in] scale Float scaling factor.
+ * @param[out] output_ptr Pointer to the output tensor. + */ + using MulFunctionFloat = void(const void *__restrict input1_ptr, const void *__restrict input2_ptr, void *__restrict output_ptr, float scale); + + MulFunctionFloat *_func_float; + MulFunctionInt *_func_int; + MulFunctionQInt *_func_q_int; + +private: + const ITensor *_input1; + const ITensor *_input2; + ITensor *_output; + float _scale; + int _scale_exponent; +}; +} +#endif /*__ARM_COMPUTE_NEPIXELWISEMULTIPLICATIONKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h new file mode 100644 index 0000000000..62a087841a --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEPoolingLayerKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ +#define __ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the pooling layer kernel */ +class NEPoolingLayerKernel : public INEKernel +{ +public: + /** Default constructor */ + NEPoolingLayerKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPoolingLayerKernel(const NEPoolingLayerKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEPoolingLayerKernel &operator=(const NEPoolingLayerKernel &) = delete; + /** Allow instances of this class to be moved */ + NEPoolingLayerKernel(NEPoolingLayerKernel &&) = default; + /** Allow instances of this class to be moved */ + NEPoolingLayerKernel &operator=(NEPoolingLayerKernel &&) = default; + /** Default destructor */ + ~NEPoolingLayerKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8/F32. + * @param[out] output Destination tensor. Data types supported: Same as @p input. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + */ + void configure(const ITensor *input, ITensor *output, const PoolingLayerInfo &pool_info); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + /** Function to perform 2x2 pooling. + * + * @param[in] window_input Input region on which to execute the kernel. 
+ * @param[in] window Output region on which to execute the kernel. + */ + template <PoolingType pooling_type> + void pooling2_f32(const Window &window_input, const Window &window); + /** Function to perform 2x2 pooling for 8-bit fixed point. + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template <PoolingType pooling_type> + void pooling2_q8(const Window &window_input, const Window &window); + /** Function to perform 3x3 pooling. + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template <PoolingType pooling_type> + void pooling3_f32(const Window &window_input, const Window &window); + /** Function to perform 3x3 pooling for 8-bit fixed point. + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + template <PoolingType pooling_type> + void pooling3_q8(const Window &window_input, const Window &window); + /** Common signature for all the specialised Pooling functions + * + * @param[in] window_input Input region on which to execute the kernel. + * @param[in] window Output region on which to execute the kernel. + */ + using PoolingFunction = void (NEPoolingLayerKernel::*)(const Window &window_input, const Window &window); + +private: + PoolingFunction _func; + const ITensor *_input; + ITensor *_output; + PoolingLayerInfo _pool_info; + int _num_elems_processed_per_iteration; + BorderSize _border_size; +}; +} +#endif /*__ARM_COMPUTE_NEPOOLINGLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NERemapKernel.h b/arm_compute/core/NEON/kernels/NERemapKernel.h new file mode 100644 index 0000000000..f9eae68ee8 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NERemapKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef __ARM_COMPUTE_NEREMAPKERNEL_H__ +#define __ARM_COMPUTE_NEREMAPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform a remap on a tensor */ +class NERemapKernel : public INEKernel +{ +public: + /** Default constructor */ + NERemapKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERemapKernel(const NERemapKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NERemapKernel &operator=(const NERemapKernel &) = delete; + /** Allow instances of this class to be moved */ + NERemapKernel(NERemapKernel &&) = default; + /** Allow instances of this class to be moved */ + NERemapKernel &operator=(NERemapKernel &&) = default; + /** Default destructor */ + ~NERemapKernel() = default; + + /** Initialize the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[in] map_x Map for X coordinates. Data type supported: F32. + * @param[in] map_y Map for Y coordinates. Data type supported: F32. + * @param[out] output Destination tensor. Data types supported: U8. All but the lowest two dimensions must be the same size as in the input tensor, i.e. remapping is only performed within the XY-plane. + * @param[in] policy The interpolation type. + */ + void configure(const ITensor *input, const ITensor *map_x, const ITensor *map_y, ITensor *output, InterpolationPolicy policy); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** function to perform nearest interpolation on the given window */ + void remap_nearest(const Window &window); + /** function to perform bilinear interpolation on the given window */ + void remap_bilinear(const Window &window); + /** Remap function to use for the particular interpolation type passed to configure() */ + void (NERemapKernel::*_func)(const Window &window); + + const ITensor *_input; /**< Input image */ + ITensor *_output; /**< Output image */ + const ITensor *_map_x; /**< Input remap x coordinates */ + const ITensor *_map_y; /**< Input remap y coordinates */ +}; +} +#endif /*__ARM_COMPUTE_NEREMAPKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEScaleKernel.h b/arm_compute/core/NEON/kernels/NEScaleKernel.h new file mode 100644 index 0000000000..03e26520b5 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEScaleKernel.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESCALEKERNEL_H__ +#define __ARM_COMPUTE_NESCALEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform scaling on a tensor */ +class NEScaleKernel : public INEKernel +{ +public: + /** Default constructor */ + NEScaleKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScaleKernel(const NEScaleKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScaleKernel &operator=(const NEScaleKernel &) = delete; + /** Allow instances of this class to be moved */ + NEScaleKernel(NEScaleKernel &&) = default; + /** Allow instances of this class to be moved */ + NEScaleKernel &operator=(NEScaleKernel &&) = default; + /** Default destructor */ + ~NEScaleKernel() = default; + + /** Initialise the kernel's inputs, output and interpolation policy + * + * @note dx, dy and offsets have the same dimensions (width and height) as the output tensor + * + * @param[in] input Source tensor. Data types supported: U8/S16. + * @param[in] dx Pixel's distance between the X real coordinate and the smallest X following integer. Data type supported: F32 + * @param[in] dy Pixel's distance between the Y real coordinate and the smallest Y following integer. Data type supported: F32 + * @param[in] offsets Offset to access the pixel with NEAREST interpolation or the top-left pixel with BILINEAR interpolation in the input tensor. Data type supported: S32. + * @param[out] output Destination tensor. Data types supported: U8/S16. All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] policy Interpolation type to use + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, const ITensor *dx, const ITensor *dy, const ITensor *offsets, ITensor *output, InterpolationPolicy policy, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + /** function to perform scale using nearest interpolation on the given window */ + void scale_nearest(const Window &window); + /** function to perform scale using bilinear interpolation on the given window */ + void scale_bilinear(const Window &window); + /** function to perform scale using area interpolation on the given window + * + * @note Used only in case of down-sampling.
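+ * For example, AREA would be the interpolation of choice when scaling a 640x480 input down to 320x240, but not for up-scaling.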
+ */ + void scale_area(const Window &window); + /** Scale function to use for the particular interpolation type passed to configure() */ + void (NEScaleKernel::*_func)(const Window &window); + + const ITensor *_offsets; + const ITensor *_dx; + const ITensor *_dy; + const ITensor *_input; + ITensor *_output; +}; +} +#endif /*__ARM_COMPUTE_NESCALEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h new file mode 100644 index 0000000000..c618456d49 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEScharr3x3Kernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESCHARR3x3KERNEL_H__ +#define __ARM_COMPUTE_NESCHARR3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to run a 3x3 Scharr filter on a tensor. + * +* @f[ +* \mathbf{G}_x=\begin{vmatrix} +* -3 & 0 & +3\\ +* -10& 0 & +10\\ +* -3 & 0 & +3 +* \end{vmatrix} +* @f] +*/ +class NEScharr3x3Kernel : public INEKernel +{ +public: + /** Default constructor */ + NEScharr3x3Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScharr3x3Kernel(const NEScharr3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEScharr3x3Kernel &operator=(const NEScharr3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + NEScharr3x3Kernel(NEScharr3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + NEScharr3x3Kernel &operator=(NEScharr3x3Kernel &&) = default; + /** Default destructor */ + ~NEScharr3x3Kernel() = default; + + /** Initialise the kernel's source, destination and border. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
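+ * + * A minimal usage sketch (hypothetical names; assumes src is an allocated U8 tensor and grad_x/grad_y are allocated S16 tensors of the same shape): + * @code + * NEScharr3x3Kernel scharr; + * // Compute both gradients; pixels on the filter border are left undefined. + * scharr.configure(&src, &grad_x, &grad_y, true); + * scharr.run(scharr.window()); // single-threaded run over the maximum window + * @endcode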
+ */ + void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + bool _run_scharr_x; /**< Do we need to run Scharr X ? */ + bool _run_scharr_y; /**< Do we need to run Scharr Y ? */ + const ITensor *_input; /**< Input tensor */ + ITensor *_output_x; /**< Output tensor for scharr X */ + ITensor *_output_y; /**< Output tensor for scharr Y */ +}; +} +#endif /*__ARM_COMPUTE_NESCHARR3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h new file mode 100644 index 0000000000..246dd83573 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NESobel3x3Kernel.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESOBEL3x3KERNEL_H__ +#define __ARM_COMPUTE_NESOBEL3x3KERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to run a 3x3 Sobel X filter on a tensor. + * + * @f[ + * \mathbf{G}_x=\begin{vmatrix} + * -1 & 0 & +1\\ + * -2 & 0 & +2\\ + * -1 & 0 & +1 + * \end{vmatrix} + * @f] +*/ +class NESobel3x3Kernel : public INEKernel +{ +public: + /** Default constructor */ + NESobel3x3Kernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel3x3Kernel(const NESobel3x3Kernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel3x3Kernel &operator=(const NESobel3x3Kernel &) = delete; + /** Allow instances of this class to be moved */ + NESobel3x3Kernel(NESobel3x3Kernel &&) = default; + /** Allow instances of this class to be moved */ + NESobel3x3Kernel &operator=(NESobel3x3Kernel &&) = default; + /** Default destructor */ + ~NESobel3x3Kernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. 
False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + bool _run_sobel_x; /**< Do we need to run Sobel X ? */ + bool _run_sobel_y; /**< Do we need to run Sobel Y ? */ + const ITensor *_input; /**< Input tensor */ + ITensor *_output_x; /**< Output tensor for sobel X */ + ITensor *_output_y; /**< Output tensor for sobel Y */ +}; +} +#endif /*__ARM_COMPUTE_NESOBEL3x3KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h new file mode 100644 index 0000000000..49c1c41e6d --- /dev/null +++ b/arm_compute/core/NEON/kernels/NESobel5x5Kernel.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESOBEL5x5KERNEL_H__ +#define __ARM_COMPUTE_NESOBEL5x5KERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to run the horizontal pass of 5x5 Sobel filter on a tensor. + * + */ +class NESobel5x5HorKernel : public INEKernel +{ +public: + /** Default constructor */ + NESobel5x5HorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel5x5HorKernel(const NESobel5x5HorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel5x5HorKernel &operator=(const NESobel5x5HorKernel &) = delete; + /** Allow instances of this class to be moved */ + NESobel5x5HorKernel(NESobel5x5HorKernel &&) = default; + /** Allow instances of this class to be moved */ + NESobel5x5HorKernel &operator=(NESobel5x5HorKernel &&) = default; + /** Default destructor */ + ~NESobel5x5HorKernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @note At least one of output_x or output_y must be set + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S16. + * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. 
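+ * + * A sketch of how the two passes are typically chained (hypothetical names; tmp_x and tmp_y are allocated S16 tensors that hold the horizontal-pass results): + * @code + * NESobel5x5HorKernel hor; + * NESobel5x5VertKernel vert; + * hor.configure(&src, &tmp_x, &tmp_y, false); + * vert.configure(&tmp_x, &tmp_y, &grad_x, &grad_y, false); + * hor.run(hor.window()); // horizontal pass first... + * vert.run(vert.window()); // ...then the vertical pass + * @endcode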
+ */ + void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + const ITensor *_input; /**< Input tensor */ + ITensor *_output_x; /**< X output of horizontal pass */ + ITensor *_output_y; /**< Y output of horizontal pass */ + bool _run_sobel_x; /**< Do we need to run Sobel X? */ + bool _run_sobel_y; /**< Do we need to run Sobel Y? */ + BorderSize _border_size; /**< Border size */ +}; + +/** Interface for the kernel to run the vertical pass of 5x5 Sobel Y filter on a tensor. + * +*/ +class NESobel5x5VertKernel : public INEKernel +{ +public: + /** Default constructor */ + NESobel5x5VertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel5x5VertKernel(const NESobel5x5VertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel5x5VertKernel &operator=(const NESobel5x5VertKernel &) = delete; + /** Allow instances of this class to be moved */ + NESobel5x5VertKernel(NESobel5x5VertKernel &&) = default; + /** Allow instances of this class to be moved */ + NESobel5x5VertKernel &operator=(NESobel5x5VertKernel &&) = default; + /** Default destructor */ + ~NESobel5x5VertKernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @param[in] input_x Input for X (X output of hor pass). Data type supported: S16. + * @param[in] input_y Input for Y (Y output of hor pass). Data type supported: S16. + * @param[out] output_x Destination tensor for the X gradient. Data type supported: S16. + * @param[out] output_y Destination tensor for the Y gradient. Data type supported: S16. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(ITensor *input_x, ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + ITensor *_input_x; /**< X input (X output of the hor pass) */ + ITensor *_input_y; /**< Y input (Y output of the hor pass) */ + ITensor *_output_x; /**< X output of sobel */ + ITensor *_output_y; /**< Y output of sobel */ + bool _run_sobel_x; /**< Do we need to run sobel X? */ + bool _run_sobel_y; /**< Do we need to run sobel Y? */ +}; +} +#endif /*__ARM_COMPUTE_NESOBEL5x5KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h new file mode 100644 index 0000000000..4bff8596b8 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NESobel7x7Kernel.h @@ -0,0 +1,122 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESOBEL7x7KERNEL_H__ +#define __ARM_COMPUTE_NESOBEL7x7KERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for the kernel to run the horizontal pass of 7x7 Sobel filter on a tensor. + * + */ +class NESobel7x7HorKernel : public INEKernel +{ +public: + /** Default constructor */ + NESobel7x7HorKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel7x7HorKernel(const NESobel7x7HorKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel7x7HorKernel &operator=(const NESobel7x7HorKernel &) = delete; + /** Allow instances of this class to be moved */ + NESobel7x7HorKernel(NESobel7x7HorKernel &&) = default; + /** Allow instances of this class to be moved */ + NESobel7x7HorKernel &operator=(NESobel7x7HorKernel &&) = default; + /** Default destructor */ + ~NESobel7x7HorKernel() = default; + + /** Initialise the kernel's source, destination and border mode. + * + * @note At least one of output_x or output_y must be set. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S32. + * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S32. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input, ITensor *output_x, ITensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + const ITensor *_input; /**< Input tensor */ + ITensor *_output_x; /**< X output of horizontal pass */ + ITensor *_output_y; /**< Y output of horizontal pass */ + bool _run_sobel_x; /**< Do we need to run Sobel X? */ + bool _run_sobel_y; /**< Do we need to run Sobel Y? */ + BorderSize _border_size; /**< Border size */ +}; + +/** Interface for the kernel to run the vertical pass of 7x7 Sobel Y filter on a tensor. + * +*/ +class NESobel7x7VertKernel : public INEKernel +{ +public: + /** Default constructor */ + NESobel7x7VertKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel7x7VertKernel(const NESobel7x7VertKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NESobel7x7VertKernel &operator=(const NESobel7x7VertKernel &) = delete; + /** Allow instances of this class to be moved */ + NESobel7x7VertKernel(NESobel7x7VertKernel &&) = default; + /** Allow instances of this class to be moved */ + NESobel7x7VertKernel &operator=(NESobel7x7VertKernel &&) = default; + /** Default destructor */ + ~NESobel7x7VertKernel() = default; + + /** Initialise the kernel's source, destination and border mode. 
+ * + * @note At least one of output_x or output_y must be set + * @note If output_x is set then input_x must be set too + * @note If output_y is set then input_y must be set too + * + * @param[in] input_x (Optional) Input for X (X output of hor pass). Data type supported: S32. + * @param[in] input_y (Optional) Input for Y (Y output of hor pass). Data type supported: S32. + * @param[out] output_x (Optional) Destination tensor for the X gradient. Data type supported: S32. + * @param[out] output_y (Optional) Destination tensor for the Y gradient. Data type supported: S32. + * @param[in] border_undefined True if the border mode is undefined. False if it's replicate or constant. + */ + void configure(const ITensor *input_x, const ITensor *input_y, ITensor *output_x, ITensor *output_y, bool border_undefined); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + const ITensor *_input_x; /**< X input (X output of the hor pass) */ + const ITensor *_input_y; /**< Y input (Y output of the hor pass) */ + ITensor *_output_x; /**< X output of sobel */ + ITensor *_output_y; /**< Y output of sobel */ + bool _run_sobel_x; /**< Do we need to run sobel X? */ + bool _run_sobel_y; /**< Do we need to run sobel Y? */ +}; +} +#endif /*__ARM_COMPUTE_NESOBEL7x7KERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h new file mode 100644 index 0000000000..ab626ad5ec --- /dev/null +++ b/arm_compute/core/NEON/kernels/NESoftmaxLayerKernel.h @@ -0,0 +1,135 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ +#define __ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** Interface for identifying the max value of 1D Logits */ +class NELogits1DMaxKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NELogits1DMaxKernel(); + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[out] output Destination tensor.
Data types supported: same as @p input + */ + void configure(const ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + BorderSize border_size() const override; + +private: + using Logits1DMaxFunction = void(const ITensor *in, ITensor *out, const Window &window); + +private: + Logits1DMaxFunction *_func; + BorderSize _border_size; +}; + +/** Interface for shifting the logits values around the max value and exponentiating the result */ +class NELogits1DShiftExpSumKernel : public INEKernel +{ +public: + /** Default constructor */ + NELogits1DShiftExpSumKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DShiftExpSumKernel(const NELogits1DShiftExpSumKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DShiftExpSumKernel &operator=(const NELogits1DShiftExpSumKernel &) = delete; + /** Allow instances of this class to be moved */ + NELogits1DShiftExpSumKernel(NELogits1DShiftExpSumKernel &&) = default; + /** Allow instances of this class to be moved */ + NELogits1DShiftExpSumKernel &operator=(NELogits1DShiftExpSumKernel &&) = default; + /** Default destructor */ + ~NELogits1DShiftExpSumKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] max Max values tensor. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data types supported: same as @p input. + * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p input. + */ + void configure(const ITensor *input, const ITensor *max, ITensor *output, ITensor *sum); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using Logits1DShiftExpSumFunction = void(const ITensor *in, const ITensor *max, ITensor *out, ITensor *sum, const Window &window); + +private: + Logits1DShiftExpSumFunction *_func; + const ITensor *_input; + const ITensor *_max; + ITensor *_output; + ITensor *_sum; +}; + +/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ +class NELogits1DNormKernel : public INEKernel +{ +public: + /** Default constructor */ + NELogits1DNormKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DNormKernel(const NELogits1DNormKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NELogits1DNormKernel &operator=(const NELogits1DNormKernel &) = delete; + /** Allow instances of this class to be moved */ + NELogits1DNormKernel(NELogits1DNormKernel &&) = default; + /** Allow instances of this class to be moved */ + NELogits1DNormKernel &operator=(NELogits1DNormKernel &&) = default; + /** Default destructor */ + ~NELogits1DNormKernel() = default; + /** Set the input and output tensors. + * + * @param[in] input Source tensor. Data types supported: QS8, F32. + * @param[in] sum Sum tensor. The number of dimensions should be dim(input)-1. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data types supported: same as @p input. 
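+ * + * A sketch of the complete softmax pipeline built from these three kernels (hypothetical names; max, tmp and sum are auxiliary tensors shaped as described above): + * @code + * NELogits1DMaxKernel max_kernel; + * NELogits1DShiftExpSumKernel shift_exp_sum_kernel; + * NELogits1DNormKernel norm_kernel; + * max_kernel.configure(&input, &max); + * shift_exp_sum_kernel.configure(&input, &max, &tmp, &sum); + * norm_kernel.configure(&tmp, &sum, &output); // output = exp(input - max) / sum + * @endcode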
+ */ + void configure(const ITensor *input, const ITensor *sum, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using Logits1DNormFunction = void(const ITensor *in, const ITensor *sum, ITensor *out, const Window &window); + +private: + Logits1DNormFunction *_func; + const ITensor *_input; + const ITensor *_sum; + ITensor *_output; +}; +} +#endif /*__ARM_COMPUTE_NESOFTMAXLAYERKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NETableLookupKernel.h b/arm_compute/core/NEON/kernels/NETableLookupKernel.h new file mode 100644 index 0000000000..b3963e5a75 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NETableLookupKernel.h @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__ +#define __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__ + +#include "arm_compute/core/NEON/INESimpleKernel.h" + +namespace arm_compute +{ +class ITensor; +class ILut; + +/** Interface for the kernel to perform table lookup calculations. */ +class NETableLookupKernel : public INESimpleKernel +{ +public: + /** Default constructor */ + NETableLookupKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETableLookupKernel(const NETableLookupKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETableLookupKernel &operator=(const NETableLookupKernel &) = delete; + /** Allow instances of this class to be moved */ + NETableLookupKernel(NETableLookupKernel &&) = default; + /** Allow instances of this class to be moved */ + NETableLookupKernel &operator=(NETableLookupKernel &&) = default; + /** Initialise the kernel's input, lut and output. + * + * @param[in] input An input tensor. Data types supported: U8/S16. + * @param[in] lut The input LUT. + * @param[out] output The output tensor. Data types supported: same as @p input + */ + void configure(const ITensor *input, const ILut *lut, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Perform table lookup on a given window. + * + * @param[in] window Region on which to execute the kernel. + */ + template <class T> + void tableLookup(const Window &window); + /** Common signature for all the specialised lut functions + * + * @param[in] window Region on which to execute the kernel.
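+ * + * (A sketch of the dispatch, assuming U8 and S16 inputs map to the tableLookup<uint8_t> and tableLookup<int16_t> instantiations: configure() stores the matching member pointer in _func and run() invokes (this->*_func)(window).)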
+ */ + using TableLookupFunction = void (NETableLookupKernel::*)(const Window &window); + /** Sub function to use for the particular tensor types passed to configure() */ + TableLookupFunction _func; + const ILut *_lut; +}; +} +#endif /* __ARM_COMPUTE_NETABLELOOKUPKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEThresholdKernel.h b/arm_compute/core/NEON/kernels/NEThresholdKernel.h new file mode 100644 index 0000000000..778176293f --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEThresholdKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NETHRESHOLDKERNEL_H__ +#define __ARM_COMPUTE_NETHRESHOLDKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Interface for the thresholding kernel + * + */ +class NEThresholdKernel : public INEKernel +{ +public: + /** Constructor + * Initialize all the pointers to nullptr and parameters to zero. + */ + NEThresholdKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEThresholdKernel(const NEThresholdKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEThresholdKernel &operator=(const NEThresholdKernel &) = delete; + /** Initialise the kernel's input, output and threshold parameters. + * + * @param[in] input An input tensor. Data type supported: U8 + * @param[out] output The output tensor. Data type supported: U8. + * @param[in] threshold Threshold. When the threshold type is RANGE, this is used as the lower threshold. + * @param[in] false_value Value to set when the condition is not respected. + * @param[in] true_value Value to set when the condition is respected. + * @param[in] type Thresholding type. Either RANGE or BINARY. + * @param[in] upper Upper threshold. Only used when the thresholding type is RANGE.
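+ * + * For example, a plain binary threshold at 127 (a sketch; src and dst are hypothetical U8 tensors): + * @code + * NEThresholdKernel thresh; + * // Pixels above 127 become 255 (true_value), all others 0 (false_value); upper is ignored for BINARY. + * thresh.configure(&src, &dst, 127, 0, 255, ThresholdType::BINARY, 0); + * @endcode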
+ */ + void configure(const ITensor *input, ITensor *output, uint8_t threshold, uint8_t false_value, uint8_t true_value, ThresholdType type, uint8_t upper); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** run binary thresholding on the given window */ + void run_binary(const Window &window); + /** run range thresholding on the given window */ + void run_range(const Window &window); + + void (NEThresholdKernel::*_func)(const Window &window); + + const ITensor *_input; /**< Input */ + ITensor *_output; /**< Output */ + uint8_t _threshold; + uint8_t _false_value; + uint8_t _true_value; + uint8_t _upper; +}; +} +#endif /*__ARM_COMPUTE_NETHRESHOLDKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NETransposeKernel.h b/arm_compute/core/NEON/kernels/NETransposeKernel.h new file mode 100644 index 0000000000..ac9449ff92 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NETransposeKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ +#define __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel which transposes the elements of a matrix. + * + * [width, height, batch] -> [height, width, batch] + * + */ +class NETransposeKernel : public INEKernel +{ +public: + /** Default constructor */ + NETransposeKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETransposeKernel(const NETransposeKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NETransposeKernel &operator=(const NETransposeKernel &) = delete; + /** Allow instances of this class to be moved */ + NETransposeKernel(NETransposeKernel &&) = default; + /** Allow instances of this class to be moved */ + NETransposeKernel &operator=(NETransposeKernel &&) = default; + /** Default destructor */ + ~NETransposeKernel() = default; + + /** Initialise the kernel's input and output. + * + * @param[in] input Input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[out] output Output tensor. 
Data type supported: Same as @p input + */ + void configure(const ITensor *input, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + /** Common signature for all the transpose functions + * + * @param[in] input An input tensor. Data types supported: U8/S8/QS8/U16/S16/F16/U32/S32/F32 + * @param[out] output The output tensor. Data type supported: same as @p input + * @param[in] window Region on which to execute the kernel. + */ + using TransposeFunction = void(const ITensor *input, ITensor *output, const Window &window); + /** Transpose function to use for the particular tensor types passed to configure() */ + TransposeFunction *_func; + const ITensor *_input; + ITensor *_output; +}; +} +#endif /* __ARM_COMPUTE_NETRANSPOSEKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEWarpKernel.h b/arm_compute/core/NEON/kernels/NEWarpKernel.h new file mode 100644 index 0000000000..10fed1d450 --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEWarpKernel.h @@ -0,0 +1,117 @@ +/* + * Copyright (c) 2016, 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEWARPKERNEL_H__ +#define __ARM_COMPUTE_NEWARPKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" +#include "arm_compute/core/Types.h" + +#include <cstdint> + +namespace arm_compute +{ +class ITensor; + +/** Common interface for warp affine and warp perspective */ +class INEWarpKernel : public INEKernel +{ +public: + /** Default constructor */ + INEWarpKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INEWarpKernel(const INEWarpKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + INEWarpKernel &operator=(const INEWarpKernel &) = delete; + /** Allow instances of this class to be moved */ + INEWarpKernel(INEWarpKernel &&) = default; + /** Allow instances of this class to be moved */ + INEWarpKernel &operator=(INEWarpKernel &&) = default; + /** Initialise the kernel's input, output and border mode. + * + * @param[in] input Source tensor. Data type supported: U8. + * @param[out] output Destination tensor. Data type supported: U8. + * @param[in] matrix The perspective or affine matrix to use. Must be 2x3 for affine or 3x3 for perspective, of type float.
+ * @param[in] border_mode Strategy to use for borders + * @param[in] constant_border_value Constant value used for filling the border. + */ + virtual void configure(const ITensor *input, ITensor *output, const float *matrix, BorderMode border_mode, uint8_t constant_border_value); + + // Inherited methods overridden: + void run(const Window &window) override; + +protected: + /** function to perform warp affine or warp perspective on the given window when border mode == UNDEFINED + * + * @param[in] window Region on which to execute the kernel + */ + virtual void warp_undefined(const Window &window) = 0; + /** function to perform warp affine or warp perspective on the given window when border mode == CONSTANT + * + * @param[in] window Region on which to execute the kernel + */ + virtual void warp_constant(const Window &window) = 0; + /** function to perform warp affine or warp perspective on the given window when border mode == REPLICATE + * + * @param[in] window Region on which to execute the kernel + */ + virtual void warp_replicate(const Window &window) = 0; + /** Common signature for all the specialised warp functions + * + * @param[in] window Region on which to execute the kernel. + */ + void (INEWarpKernel::*_func)(const Window &window); + + const ITensor *_input; /**< Input Tensor */ + ITensor *_output; /**< Output Tensor */ + uint8_t _constant_border_value; /**< Constant value used for filling the border. This value is used for those pixels out of the ROI when the border mode is CONSTANT */ + const float *_matrix; /**< The affine or perspective matrix. Must be 2x3 for warp affine or 3x3 for warp perspective, of type float. */ +}; + +/** Template interface for the kernel to compute warp affine + * + */ +template <InterpolationPolicy interpolation> +class NEWarpAffineKernel : public INEWarpKernel +{ +private: + // Inherited methods overridden: + void warp_undefined(const Window &window) override; + void warp_constant(const Window &window) override; + void warp_replicate(const Window &window) override; +}; + +/** Template interface for the kernel to compute warp perspective + * + */ +template <InterpolationPolicy interpolation> +class NEWarpPerspectiveKernel : public INEWarpKernel +{ +private: + // Inherited methods overridden: + void warp_undefined(const Window &window) override; + void warp_constant(const Window &window) override; + void warp_replicate(const Window &window) override; +}; +} +#endif /*__ARM_COMPUTE_NEWARPKERNEL_H__ */ diff --git a/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h new file mode 100644 index 0000000000..cad2d00b1f --- /dev/null +++ b/arm_compute/core/NEON/kernels/NEWeightsReshapeKernel.h @@ -0,0 +1,94 @@ +/* + * Copyright (c) 2017 ARM Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ +#define __ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ + +#include "arm_compute/core/NEON/INEKernel.h" + +namespace arm_compute +{ +class ITensor; + +/** NEON kernel to perform reshaping on the weights used by convolution and locally connected layers + * + * Rearranges each 3-dimensional kernel into a single row, leading to a matrix with linearized kernels. + * In combination with the @ref NEIm2ColKernel, this kernel can transform a convolution into a matrix multiplication. + * + * For example, assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have: + * @f[ + * \left( \begin{array}{ccc} + * a000 & a001 & a002 \\ + * a010 & a011 & a012 \\ + * a020 & a021 & a022 \\ + * \end{array} \right) + * \left( \begin{array}{ccc} + * a100 & a101 & a102 \\ + * a110 & a111 & a112 \\ + * a120 & a121 & a122 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{cccccccccccccccccc} + * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\ + * \end{array} \right) + * @f] + */ +class NEWeightsReshapeKernel : public INEKernel +{ +public: + /** Constructor.*/ + NEWeightsReshapeKernel(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete; + /** Allow instances of this class to be moved */ + NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default; + /** Allow instances of this class to be moved */ + NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default; + /** Default destructor */ + ~NEWeightsReshapeKernel() = default; + /** Set the input and output of the kernel. + * + * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared, + * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: QS8/F32 + * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with + * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input + * @param[out] output The output tensor. Data types supported: Same as @p input + */ + void configure(const ITensor *input, const ITensor *bias, ITensor *output); + + // Inherited methods overridden: + void run(const Window &window) override; + +private: + using WeightsReshapeKernel = void(const ITensor *input, const ITensor *bias, ITensor *output, const Window &window); + + WeightsReshapeKernel *_func; + const ITensor *_input; + const ITensor *_bias; + ITensor *_output; +}; +} + +#endif /*__ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H__ */ -- cgit v1.2.1