diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2020-10-02 16:38:59 +0100 |
---|---|---|
committer | Georgios Pinitas <georgios.pinitas@arm.com> | 2020-10-07 09:54:17 +0000 |
commit | ddb93bbf12fc9d685e7ddbef703a886d67cbda9b (patch) | |
tree | 6dc7bba4a3ffaa527f4972d85c951a012cce5231 /src/core/NEON/kernels/detail | |
parent | 4d91dc68adf8a4cc07285fe781469231230df3b9 (diff) | |
download | ComputeLibrary-ddb93bbf12fc9d685e7ddbef703a886d67cbda9b.tar.gz |
COMPMID-3637: Move wrapper to src
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Change-Id: I524b0c4b49c7a7035b7d078b9585d77b0d438e10
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4083
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Michalis Spyrou <michalis.spyrou@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src/core/NEON/kernels/detail')
-rw-r--r-- | src/core/NEON/kernels/detail/NEActivationFunctionDetail.h | 315 | ||||
-rw-r--r-- | src/core/NEON/kernels/detail/NEColorConvertHelper.inl | 1045 | ||||
-rw-r--r-- | src/core/NEON/kernels/detail/NEDirectConvolution3x3.h | 170 | ||||
-rw-r--r-- | src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h | 965 |
4 files changed, 2495 insertions, 0 deletions
diff --git a/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h new file mode 100644 index 0000000000..eef1be06eb --- /dev/null +++ b/src/core/NEON/kernels/detail/NEActivationFunctionDetail.h @@ -0,0 +1,315 @@ +/* + * Copyright (c) 2018-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H +#define ARM_COMPUTE_DETAIL_NEACTIVATION_FUNCTION_DETAIL_H + +#include "src/core/NEON/wrapper/wrapper.h" + +namespace arm_compute +{ +namespace detail +{ +/** Dummy activation object */ +template <typename T, int S> +struct dummy +{ + /** NEON vector type. */ + using ExactType = typename wrapper::traits::neon_vector<T, S>::type; + + /** Construct a dummy activation object. + * + * @param[in] act_info Activation layer information. 
/** Linear activation object
 *
 * Applies y = alpha * x + beta, where alpha and beta are taken from
 * ActivationLayerInfo::a() and ActivationLayerInfo::b() respectively.
 */
template <typename T, int S>
struct linear
{
    /** NEON vector type. */
    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;

    /** Construct a Linear activation object.
     *
     * @param[in] act_info Activation layer information; a() provides alpha and b() provides beta.
     */
    explicit linear(ActivationLayerInfo act_info)
        : alpha(act_info.a()),
          beta(act_info.b()),
          // The vector constants are built from the scalar members above; this relies on
          // alpha/beta being declared (and therefore initialised) before valpha/vbeta.
          valpha(wrapper::vdup_n(static_cast<T>(alpha), ExactTagType{})),
          vbeta(wrapper::vdup_n(static_cast<T>(beta), ExactTagType{}))
    {
    }

    /** Run activation function on a vector.
     *
     * @param[in,out] vval Vector of values; overwritten with the result.
     */
    void operator()(ExactType &vval)
    {
        // NOTE(review): wrapper::vmla(a, b, c) is presumably a + b * c, i.e. the vector
        // counterpart of the scalar overload below - confirm against the wrapper header.
        vval = wrapper::vmla(vbeta, vval, valpha);
    }

    /** Run activation function on a scalar.
     *
     * @param[in,out] val Scalar value; overwritten with alpha * val + beta.
     */
    void operator()(T &val)
    {
        val = alpha * val + beta;
    }

    const T         alpha;  /**< Scalar alpha */
    const T         beta;   /**< Scalar beta */
    const ExactType valpha; /**< Vector of alphas. */
    const ExactType vbeta;  /**< Vector of betas. */
};
/** Logistic activation object
 *
 * Applies y = 1 / (1 + exp(-x)). The activation parameters a() and b() are not used.
 */
template <typename T, int S>
struct logistic
{
    /** NEON vector type. */
    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;

    /** Construct a Logistic activation object.
     *
     * @param[in] act_info Activation layer information (unused).
     */
    explicit logistic(ActivationLayerInfo act_info)
        : vone(wrapper::vdup_n(static_cast<T>(1), ExactTagType{}))
    {
        ARM_COMPUTE_UNUSED(act_info);
    }

    /** Run activation function on a vector.
     *
     * @param[in,out] vval Vector of values; overwritten with the result.
     */
    void operator()(ExactType &vval)
    {
        // Same shape as the scalar overload: inv(1 + exp(-x)).
        // NOTE(review): the accuracy of wrapper::vinv / wrapper::vexpq is not visible here,
        // so vector and scalar results may differ slightly - confirm if bit-exactness matters.
        vval = wrapper::vinv(wrapper::vadd(vone, wrapper::vexpq(wrapper::vneg(vval))));
    }

    /** Run activation function on a scalar.
     *
     * @param[in,out] val Scalar value; overwritten with 1 / (1 + exp(-val)).
     */
    void operator()(T &val)
    {
        val = 1 / (1 + std::exp(-val));
    }

    /** Vector of ones. */
    const ExactType vone;
};
/** Lower-Upper Bounded RELU activation object
 *
 * Clamps the input to the range [beta, alpha]: y = min(alpha, max(beta, x)),
 * where alpha is ActivationLayerInfo::a() and beta is ActivationLayerInfo::b().
 */
template <typename T, int S>
struct lubrelu
{
    /** NEON vector type. */
    using ExactType = typename wrapper::traits::neon_vector<T, S>::type;
    /** NEON vector tag type. */
    using ExactTagType = typename wrapper::traits::neon_vector<T, S>::tag_type;

    /** Construct a lower-upper bounded RELU activation object.
     *
     * @param[in] act_info Activation layer information; a() is the upper bound and b() the lower bound.
     */
    explicit lubrelu(ActivationLayerInfo act_info)
        : alpha(act_info.a()),
          beta(act_info.b()),
          valpha(wrapper::vdup_n(static_cast<T>(act_info.a()), ExactTagType{})),
          vbeta(wrapper::vdup_n(static_cast<T>(act_info.b()), ExactTagType{}))
    {
    }

    /** Run activation function on a vector.
     *
     * @param[in,out] vval Vector of values; overwritten with the clamped result.
     */
    void operator()(ExactType &vval)
    {
        vval = wrapper::vmin(valpha, wrapper::vmax(vbeta, vval));
    }

    /** Run activation function on a scalar.
     *
     * @param[in,out] val Scalar value; overwritten with min(alpha, max(beta, val)).
     */
    void operator()(T &val)
    {
        val = std::min(alpha, std::max(beta, val));
    }

    const T         alpha;  /**< Scalar alpha (upper bound) */
    const T         beta;   /**< Scalar beta (lower bound) */
    const ExactType valpha; /**< Vector of alphas. */
    const ExactType vbeta;  /**< Vector of betas. */
};
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IMultiImage.h" +#include "arm_compute/core/Utils.h" +#include "src/core/NEON/NEMath.h" + +#include <arm_neon.h> + +namespace +{ +#ifndef DOXYGEN_SKIP_THIS +constexpr float red_coef_bt709 = 1.5748F; +constexpr float green_coef_bt709 = -0.1873f; +constexpr float green_coef2_bt709 = -0.4681f; +constexpr float blue_coef_bt709 = 1.8556f; + +constexpr float rgb2yuv_bt709_kr = 0.2126f; +constexpr float rgb2yuv_bt709_kb = 0.0722f; +// K_g = 1 - K_r - K_b +constexpr float rgb2yuv_bt709_kg = 0.7152f; +// C_u = 1 / (2 * (1 - K_b)) +constexpr float rgb2yuv_bt709_cu = 0.5389f; +// C_v = 1 / (2 * (1 - K_r)) +constexpr float rgb2yuv_bt709_cv = 0.6350f; + +constexpr float rgb2u8_red_coef = 0.2126f; +constexpr float rgb2u8_green_coef = 0.7152f; +constexpr float rgb2u8_blue_coef = 0.0722f; + +inline float32x4_t rgb_to_greyscale_calculation(const float32x4_t &rcolor, const float32x4_t &gcolor, const float32x4_t &bcolor, + const float rcoef, const float gcoef, const float bcoef) +{ + float32x4_t greyscale = vmulq_n_f32(rcolor, rcoef); + greyscale = vmlaq_n_f32(greyscale, gcolor, gcoef); + greyscale = vmlaq_n_f32(greyscale, bcolor, bcoef); + return greyscale; +} + +inline void rgb_to_u8_conversion(const uint8x16x3_t &in, uint8x16_t &out) +{ + float32x4x4_t out_float32; + + //Conversion from 3(RGB) 4 uint8s to 3(RGB) 4 floats + const float32x4x4_t r_float32 = 
/** Convert four RGB pixels (as floats) to BT.709 YUV.
 *
 * @param[in]  rvec Red channel values.
 * @param[in]  gvec Green channel values.
 * @param[in]  bvec Blue channel values.
 * @param[out] yvec Luma values.
 * @param[out] uvec U chroma values, offset by +128.
 * @param[out] vvec V chroma values, offset by +128.
 *
 * @note yvec is written first and then read back to derive uvec and vvec, so the
 *       statement order below matters.
 */
inline void rgb_to_yuv_calculation(const float32x4_t &rvec, const float32x4_t &gvec, const float32x4_t &bvec,
                                   float32x4_t &yvec, float32x4_t &uvec, float32x4_t &vvec)
{
    /*
    Y'= 0.2126*R' + 0.7152*G' + 0.0722*B'
    U'=-0.1146*R' - 0.3854*G' + 0.5000*B'
    V'= 0.5000*R' - 0.4542*G' - 0.0458*B'
    (U' and V' are additionally offset by +128 below.)
    */
    const auto c128 = vdupq_n_f32(128.f);

    // Y = R * K_r + G * (1 - K_r - K_b) + B * K_b
    yvec = vmulq_n_f32(rvec, rgb2yuv_bt709_kr);
    yvec = vmlaq_n_f32(yvec, gvec, rgb2yuv_bt709_kg);
    yvec = vmlaq_n_f32(yvec, bvec, rgb2yuv_bt709_kb);

    // U = 128 + (B - Y) / (2 * (1 - K_b))
    uvec = vsubq_f32(bvec, yvec);
    uvec = vmlaq_n_f32(c128, uvec, rgb2yuv_bt709_cu);

    // V = 128 + (R - Y) / (2 * (1 - K_r))
    vvec = vsubq_f32(rvec, yvec);
    vvec = vmlaq_n_f32(c128, vvec, rgb2yuv_bt709_cv);
}
/** Convert eight YUV pixels that share four chroma pairs to RGB(A) and store them.
 *
 * Each lane i of uvec_val/vvec_val is shared by two pixels whose luma comes from lane i
 * of yvec_val (first pixel) and lane i of yyvec_val (second pixel).
 *
 * @param[in]  yvec_val   Luma of the first pixel of each pair.
 * @param[in]  uvec_val   U chroma values (taken by value; 128 is subtracted locally).
 * @param[in]  yyvec_val  Luma of the second pixel of each pair.
 * @param[in]  vvec_val   V chroma values (taken by value; 128 is subtracted locally).
 * @param[out] output_ptr Destination; 24 bytes are written when @p alpha is false,
 *                        32 bytes when it is true.
 * @param[in]  alpha      If true an alpha channel of 255 is stored after each RGB triple.
 */
inline void yuyv_to_rgb_calculation(const float32x4_t &yvec_val, float32x4_t uvec_val, const float32x4_t &yyvec_val,
                                    float32x4_t vvec_val, unsigned char *output_ptr, const bool alpha)
{
    float32x4x3_t rgb1, rgb2;

    // Compute: cb - 128 and cr - 128;
    const auto c128 = vdupq_n_f32(128.f);
    uvec_val        = vsubq_f32(uvec_val, c128);
    vvec_val        = vsubq_f32(vvec_val, c128);

    // Compute:
    // r =  0.0000f*f_u + 1.5748f*f_v;
    // g = -0.1873f*f_u - 0.4681f*f_v;
    // b =  1.8556f*f_u + 0.0000f*f_v;
    const auto red   = vmulq_n_f32(vvec_val, red_coef_bt709);
    const auto blue  = vmulq_n_f32(uvec_val, blue_coef_bt709);
    const auto green = vaddq_f32(vmulq_n_f32(uvec_val, green_coef_bt709),
                                 vmulq_n_f32(vvec_val, green_coef2_bt709));

    // Compute the final r,g,b values using y1 for the first texel and y2 for the second one.
    // the result is stored in two float32x4x3_t which then are converted to one uint8x8x3_t
    // and written back to memory using vst3 instruction

    rgb1.val[0] = vaddq_f32(yvec_val, red);
    rgb1.val[1] = vaddq_f32(yvec_val, green);
    rgb1.val[2] = vaddq_f32(yvec_val, blue);

    rgb2.val[0] = vaddq_f32(yyvec_val, red);
    rgb2.val[1] = vaddq_f32(yyvec_val, green);
    rgb2.val[2] = vaddq_f32(yyvec_val, blue);

    uint8x8x3_t u8_rgb;
    arm_compute::convert_float32x4x3_to_uint8x8x3(rgb1, rgb2, u8_rgb);

    // The lanes are stored in the order 0,4,1,5,2,6,3,7 so that first/second pixels of each
    // pair end up adjacent in memory.
    // NOTE(review): this presumes the conversion helper places rgb1 in lanes 0-3 and rgb2 in
    // lanes 4-7 - confirm against NEMath.
    if(!alpha)
    {
        vst3_lane_u8(&output_ptr[0], u8_rgb, 0);
        vst3_lane_u8(&output_ptr[3], u8_rgb, 4);
        vst3_lane_u8(&output_ptr[6], u8_rgb, 1);
        vst3_lane_u8(&output_ptr[9], u8_rgb, 5);
        vst3_lane_u8(&output_ptr[12], u8_rgb, 2);
        vst3_lane_u8(&output_ptr[15], u8_rgb, 6);
        vst3_lane_u8(&output_ptr[18], u8_rgb, 3);
        vst3_lane_u8(&output_ptr[21], u8_rgb, 7);
    }
    else
    {
        uint8x8x4_t u8_rgba;
        u8_rgba.val[0] = u8_rgb.val[0];
        u8_rgba.val[1] = u8_rgb.val[1];
        u8_rgba.val[2] = u8_rgb.val[2];
        u8_rgba.val[3] = vdup_n_u8(255); // opaque alpha
        vst4_lane_u8(&output_ptr[0], u8_rgba, 0);
        vst4_lane_u8(&output_ptr[4], u8_rgba, 4);
        vst4_lane_u8(&output_ptr[8], u8_rgba, 1);
        vst4_lane_u8(&output_ptr[12], u8_rgba, 5);
        vst4_lane_u8(&output_ptr[16], u8_rgba, 2);
        vst4_lane_u8(&output_ptr[20], u8_rgba, 6);
        vst4_lane_u8(&output_ptr[24], u8_rgba, 3);
        vst4_lane_u8(&output_ptr[28], u8_rgba, 7);
    }
}
vec_bottom.val[0]); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec_bottom, vec_bottom.val[1]); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec_bottom, vec_bottom.val[2]); +} + +inline void store_rgb_to_nv12(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, + unsigned char *const __restrict out_uv) +{ + uint8x16x3_t vec_top, vec_bottom; + vec_top.val[0] = rvec_top; + vec_top.val[1] = gvec_top; + vec_top.val[2] = bvec_top; + vec_bottom.val[0] = rvec_bottom; + vec_bottom.val[1] = gvec_bottom; + vec_bottom.val[2] = bvec_bottom; + + rgb_to_yuv_conversion(vec_top, vec_bottom); + + vst1q_u8(out_y_top, vec_top.val[0]); + vst1q_u8(out_y_bottom, vec_bottom.val[0]); + + const auto uvec = vuzpq_u8(vec_top.val[1], vec_bottom.val[1]); + const auto vvec = vuzpq_u8(vec_top.val[2], vec_bottom.val[2]); + const auto utmp = vrhaddq_u8(uvec.val[0], uvec.val[1]); + const auto vtmp = vrhaddq_u8(vvec.val[0], vvec.val[1]); + + uint8x8x2_t uvvec; + uvvec.val[0] = vhadd_u8(vget_low_u8(utmp), vget_high_u8(utmp)); + uvvec.val[1] = vhadd_u8(vget_low_u8(vtmp), vget_high_u8(vtmp)); + + vst2_u8(out_uv, uvvec); +} + +inline void store_rgb_to_iyuv(const uint8x16_t &rvec_top, const uint8x16_t &gvec_top, const uint8x16_t &bvec_top, + const uint8x16_t &rvec_bottom, const uint8x16_t &gvec_bottom, const uint8x16_t &bvec_bottom, + unsigned char *const __restrict out_y_top, unsigned char *const __restrict out_y_bottom, + unsigned char *const __restrict out_u, + unsigned char *const __restrict out_v) +{ + uint8x16x3_t vec_top, vec_bottom; + vec_top.val[0] = rvec_top; + vec_top.val[1] = gvec_top; + vec_top.val[2] = bvec_top; + vec_bottom.val[0] = rvec_bottom; + vec_bottom.val[1] = gvec_bottom; + vec_bottom.val[2] = bvec_bottom; + + rgb_to_yuv_conversion(vec_top, vec_bottom); + + 
vst1q_u8(out_y_top, vec_top.val[0]); + vst1q_u8(out_y_bottom, vec_bottom.val[0]); + + const auto uvvec_top = vuzpq_u8(vec_top.val[1], vec_top.val[2]); + const auto uvvec_bottom = vuzpq_u8(vec_bottom.val[1], vec_bottom.val[2]); + const auto uvvec = vhaddq_u8(vrhaddq_u8(uvvec_top.val[0], uvvec_top.val[1]), + vrhaddq_u8(uvvec_bottom.val[0], uvvec_bottom.val[1])); + + vst1_u8(out_u, vget_low_u8(uvvec)); + vst1_u8(out_v, vget_high_u8(uvvec)); +} + +inline void store_rgb_to_yuv4(const uint8x16_t &rvec, const uint8x16_t &gvec, const uint8x16_t &bvec, + unsigned char *const __restrict out_y, + unsigned char *const __restrict out_u, + unsigned char *const __restrict out_v) +{ + // Convert the uint8x16_t to float32x4x4_t + const float32x4x4_t frvec = arm_compute::convert_uint8x16_to_float32x4x4(rvec); + const float32x4x4_t fgvec = arm_compute::convert_uint8x16_to_float32x4x4(gvec); + const float32x4x4_t fbvec = arm_compute::convert_uint8x16_to_float32x4x4(bvec); + + float32x4x4_t fyvec, fuvec, fvvec; + for(auto i = 0; i < 4; ++i) + { + rgb_to_yuv_calculation(frvec.val[i], fgvec.val[i], fbvec.val[i], + fyvec.val[i], fuvec.val[i], fvvec.val[i]); + } + + uint8x16_t yvec, uvec, vvec; + arm_compute::convert_float32x4x4_to_uint8x16(fyvec, yvec); + arm_compute::convert_float32x4x4_to_uint8x16(fuvec, uvec); + arm_compute::convert_float32x4x4_to_uint8x16(fvvec, vvec); + + vst1q_u8(out_y, yvec); + vst1q_u8(out_u, uvec); + vst1q_u8(out_v, vvec); +} +#endif /* DOXYGEN_SKIP_THIS */ +} + +namespace arm_compute +{ +/** Convert RGB to RGBX. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output RGBX buffer. + * @param[in] win Window for iterating the buffers. 
+ * + */ +void colorconvert_rgb_to_rgbx(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IImage *__restrict>(output); + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16x4_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + ta2.val[3] = vdupq_n_u8(255); + vst4q_u8(out.ptr(), ta2); + }, + in, out); +} + +/** Convert RGB to U8. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output U8 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_rgb_to_u8(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IImage *__restrict>(output); + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta1 = vld3q_u8(in.ptr()); + uint8x16_t ta2; + rgb_to_u8_conversion(ta1, ta2); + vst1q_u8(out.ptr(), ta2); + }, + in, out); +} + +/** Convert RGBX to RGB. + * + * @param[in] input Input RGBX data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. 
+ * + */ +void colorconvert_rgbx_to_rgb(const void *input, void *output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IImage *__restrict>(output); + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta1 = vld4q_u8(in.ptr()); + uint8x16x3_t ta2; + ta2.val[0] = ta1.val[0]; + ta2.val[1] = ta1.val[1]; + ta2.val[2] = ta1.val[2]; + vst3q_u8(out.ptr(), ta2); + }, + in, out); +} + +/** Convert YUYV to RGB. + * + * @param[in] input Input YUYV data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template <bool yuyv, bool alpha> +void colorconvert_yuyv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IImage *__restrict>(output); + + constexpr auto element_size = alpha ? 32 : 24; + constexpr auto shift = yuyv ? 0 : 1; + + Iterator in(input_ptr, win); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta = vld4q_u8(in.ptr()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... 
+ + // Convert the uint8x16x4_t to float32x4x4_t + const float32x4x4_t yvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[0 + shift]); + const float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[1 - shift]); + const float32x4x4_t yyvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[2 + shift]); + const float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta.val[3 - shift]); + + yuyv_to_rgb_calculation(yvec.val[0], uvec.val[0], yyvec.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec.val[1], uvec.val[1], yyvec.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec.val[2], uvec.val[2], yyvec.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec.val[3], uvec.val[3], yyvec.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); + }, + in, out); +} + +/** Convert NV12 to RGB. + * + * @param[in] input Input NV12 data buffer. + * @param[out] output Output RGB buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template <bool uv, bool alpha> +void colorconvert_nv12_to_rgb(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); + const auto output_ptr = static_cast<IImage *__restrict>(output); + + constexpr auto element_size = alpha ? 32 : 24; + const auto out_stride = output_ptr->info()->strides_in_bytes().y(); + constexpr auto shift = uv ? 
0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out(output_ptr, win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + // Convert the uint8x16x4_t to float32x4x4_t + float32x4x4_t yvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]); + float32x4x4_t yyvec_top = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]); + float32x4x4_t yvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]); + float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]); + float32x4x4_t uvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[0 + shift]); + float32x4x4_t vvec = arm_compute::convert_uint8x16_to_float32x4x4(ta_uv.val[1 - shift]); + + yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha); + yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha); + + yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], 
yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
    },
    in_y, in_uv, out);
}

/** Convert IYUV to RGB.
 *
 * Each window step reads 32 luma bytes from two consecutive rows of input plane 0 and
 * 16 bytes from each of input planes 1 (U) and 2 (V); the same U/V vectors are used for
 * both rows. The values are widened to float and handed to yuyv_to_rgb_calculation(),
 * which writes the top row at out.ptr() and the bottom row one output row stride further.
 *
 * @tparam alpha Forwarded to yuyv_to_rgb_calculation(); it also selects the byte distance
 *               (32 when true, 24 when false) between the four output chunks of a row.
 *
 * @param[in]  input  Input IYUV data buffer (cast to const IMultiImage *).
 * @param[out] output Output RGB buffer (cast to IImage *).
 * @param[in]  win    Window for iterating the buffers.
 *
 */
template <bool alpha>
void colorconvert_iyuv_to_rgb(const void *__restrict input, void *__restrict output, const Window &win)
{
    ARM_COMPUTE_ERROR_ON(nullptr == input);
    ARM_COMPUTE_ERROR_ON(nullptr == output);
    win.validate();

    const auto input_ptr  = static_cast<const IMultiImage *__restrict>(input);
    const auto output_ptr = static_cast<IImage *__restrict>(output);

    constexpr auto element_size = alpha ? 32 : 24;
    const auto     out_stride   = output_ptr->info()->strides_in_bytes().y();

    // UV's width and height are subsampled
    Window win_uv(win);
    win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2));
    win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1));
    win_uv.validate();

    Iterator in_y(input_ptr->plane(0), win);
    Iterator in_u(input_ptr->plane(1), win_uv);
    Iterator in_v(input_ptr->plane(2), win_uv);
    Iterator out(output_ptr, win);

    execute_window_loop(win, [&](const Coordinates &)
    {
        const auto *y_top_ptr    = in_y.ptr();
        const auto *y_bottom_ptr = in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y();
        const auto *u_ptr        = in_u.ptr();
        const auto *v_ptr        = in_v.ptr();

        // Work-around issue in gcc 9(>=) where vld2q might cause issues with register allocation.
        // The guard must test __aarch64__ (the macro compilers predefine for AArch64); the former
        // spelling __arch64__ is never defined, so this branch was silently compiled out.
#if defined(__aarch64__)
        const auto ta0_y_top    = vld1q_u8(y_top_ptr);
        const auto ta1_y_top    = vld1q_u8(y_top_ptr + 16);
        const auto ta0_y_bottom = vld1q_u8(y_bottom_ptr);
        const auto ta1_y_bottom = vld1q_u8(y_bottom_ptr + 16);
        const auto ta_u         = vld1q_u8(u_ptr);
        const auto ta_v         = vld1q_u8(v_ptr);

        // vuzp1q/vuzp2q pick the even/odd bytes of the two loads, i.e. the same split vld2q produces.
        // Convert each uint8x16_t to float32x4x4_t
        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_top, ta1_y_top));
        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_top, ta1_y_top));
        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(vuzp1q_u8(ta0_y_bottom, ta1_y_bottom));
        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(vuzp2q_u8(ta0_y_bottom, ta1_y_bottom));
        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#else  /* defined(__aarch64__) */
        const auto ta_y_top    = vld2q_u8(y_top_ptr);
        const auto ta_y_bottom = vld2q_u8(y_bottom_ptr);
        const auto ta_u        = vld1q_u8(u_ptr);
        const auto ta_v        = vld1q_u8(v_ptr);
        //ta_y.val[0] = Y0 Y2 Y4 Y6 ...
        //ta_y.val[1] = Y1 Y3 Y5 Y7 ...
        //ta_u        = U0 U2 U4 U6 ...
        //ta_v        = V0 V2 V4 V6 ...

        // Convert each uint8x16_t to float32x4x4_t
        float32x4x4_t yvec_top     = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[0]);
        float32x4x4_t yyvec_top    = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_top.val[1]);
        float32x4x4_t yvec_bottom  = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[0]);
        float32x4x4_t yyvec_bottom = arm_compute::convert_uint8x16_to_float32x4x4(ta_y_bottom.val[1]);
        float32x4x4_t uvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_u);
        float32x4x4_t vvec         = arm_compute::convert_uint8x16_to_float32x4x4(ta_v);
#endif /* defined(__aarch64__) */

        // Top output row
        yuyv_to_rgb_calculation(yvec_top.val[0], uvec.val[0], yyvec_top.val[0], vvec.val[0], out.ptr() + 0 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_top.val[1], uvec.val[1], yyvec_top.val[1], vvec.val[1], out.ptr() + 1 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_top.val[2], uvec.val[2], yyvec_top.val[2], vvec.val[2], out.ptr() + 2 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_top.val[3], uvec.val[3], yyvec_top.val[3], vvec.val[3], out.ptr() + 3 * element_size, alpha);

        // Bottom output row (shares the chroma vectors with the top row)
        yuyv_to_rgb_calculation(yvec_bottom.val[0], uvec.val[0], yyvec_bottom.val[0], vvec.val[0], out.ptr() + out_stride + 0 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_bottom.val[1], uvec.val[1], yyvec_bottom.val[1], vvec.val[1], out.ptr() + out_stride + 1 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_bottom.val[2], uvec.val[2], yyvec_bottom.val[2], vvec.val[2], out.ptr() + out_stride + 2 * element_size, alpha);
        yuyv_to_rgb_calculation(yvec_bottom.val[3], uvec.val[3], yyvec_bottom.val[3], vvec.val[3], out.ptr() + out_stride + 3 * element_size, alpha);
    },
    in_y, in_u, in_v, out);
}

/** Convert YUYV to NV12.
 *
 * @param[in]  input  Input YUYV data buffer.
+ * @param[out] output Output NV12 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template <bool yuyv> +void colorconvert_yuyv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + constexpr auto shift = yuyv ? 0 : 1; + + // NV12's UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16x2_t uvvec; + uvvec.val[0] = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + uvvec.val[1] = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst2q_u8(out_uv.ptr(), uvvec); + }, + in, out_y, out_uv); +} + +/** Convert IYUV to NV12. + * + * @param[in] input Input IYUV data buffer. + * @param[out] output Output NV12 buffer. 
+ * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_iyuv_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + uint8x16x2_t ta_uv; + ta_uv.val[0] = vld1q_u8(in_u.ptr()); + ta_uv.val[1] = vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst2q_u8(out_uv.ptr(), ta_uv); + }, + in_y, in_u, in_v, out_y, out_uv); +} + +/** Convert NV12 to IYUV. + * + * @param[in] input Input NV12 data buffer. + * @param[out] output Output IYUV buffer. + * @param[in] win Window for iterating the buffers. 
+ * + */ +template <bool uv> +void colorconvert_nv12_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + constexpr auto shift = uv ? 0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + vst1q_u8(out_u.ptr(), ta_uv.val[0 + shift]); + vst1q_u8(out_v.ptr(), ta_uv.val[1 - shift]); + }, + in_y, in_uv, out_y, out_u, out_v); +} + +/** Convert YUYV to IYUV. + * + * @param[in] input Input YUYV data buffer. + * @param[out] output Output IYUV buffer. + * @param[in] win Window for iterating the buffers. 
+ * + */ +template <bool yuyv> +void colorconvert_yuyv_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + constexpr auto shift = yuyv ? 0 : 1; + + // Destination's UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_top = vld4q_u8(in.ptr()); + const auto ta_bottom = vld4q_u8(in.ptr() + input_ptr->info()->strides_in_bytes().y()); + //ta.val[0] = Y0 Y2 Y4 Y6 ... + //ta.val[1] = U0 U2 U4 U6 ... + //ta.val[2] = Y1 Y3 Y5 Y7 ... + //ta.val[3] = V0 V2 V4 V7 ... + + uint8x16x2_t yvec; + yvec.val[0] = ta_top.val[0 + shift]; + yvec.val[1] = ta_top.val[2 + shift]; + vst2q_u8(out_y.ptr(), yvec); + + uint8x16x2_t yyvec; + yyvec.val[0] = ta_bottom.val[0 + shift]; + yyvec.val[1] = ta_bottom.val[2 + shift]; + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), yyvec); + + uint8x16_t uvec; + uvec = vhaddq_u8(ta_top.val[1 - shift], ta_bottom.val[1 - shift]); + vst1q_u8(out_u.ptr(), uvec); + + uint8x16_t vvec; + vvec = vhaddq_u8(ta_top.val[3 - shift], ta_bottom.val[3 - shift]); + vst1q_u8(out_v.ptr(), vvec); + }, + in, out_y, out_u, out_v); +} + +/** Convert NV12 to YUV4. + * + * @param[in] input Input NV12 data buffer. + * @param[out] output Output YUV4 buffer. 
+ * @param[in] win Window for iterating the buffers. + * + */ +template <bool uv> +void colorconvert_nv12_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + constexpr auto shift = uv ? 0 : 1; + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_uv(input_ptr->plane(1), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_uv = vld2q_u8(in_uv.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_uv.val[0] = U0 U2 U4 U6 ... + //ta_uv.val[1] = V0 V2 V4 V6 ... 
+ + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_uv.val[0 + shift]; + uvec.val[1] = ta_uv.val[0 + shift]; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_uv.val[1 - shift]; + vvec.val[1] = ta_uv.val[1 - shift]; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_uv, out_y, out_u, out_v); +} + +/** Convert IYUV to YUV4. + * + * @param[in] input Input IYUV data buffer. + * @param[out] output Output YUV4 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +void colorconvert_iyuv_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IMultiImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in_y(input_ptr->plane(0), win); + Iterator in_u(input_ptr->plane(1), win_uv); + Iterator in_v(input_ptr->plane(2), win_uv); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_y_top = vld2q_u8(in_y.ptr()); + const auto ta_y_bottom = vld2q_u8(in_y.ptr() + input_ptr->plane(0)->info()->strides_in_bytes().y()); + const auto ta_u = vld1q_u8(in_u.ptr()); + const auto ta_v = 
vld1q_u8(in_v.ptr()); + //ta_y.val[0] = Y0 Y2 Y4 Y6 ... + //ta_y.val[1] = Y1 Y3 Y5 Y7 ... + //ta_u = U0 U2 U4 U6 ... + //ta_v = V0 V2 V4 V6 ... + + vst2q_u8(out_y.ptr(), ta_y_top); + vst2q_u8(out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), ta_y_bottom); + + uint8x16x2_t uvec; + uvec.val[0] = ta_u; + uvec.val[1] = ta_u; + vst2q_u8(out_u.ptr(), uvec); + vst2q_u8(out_u.ptr() + output_ptr->plane(1)->info()->strides_in_bytes().y(), uvec); + + uint8x16x2_t vvec; + vvec.val[0] = ta_v; + vvec.val[1] = ta_v; + vst2q_u8(out_v.ptr(), vvec); + vst2q_u8(out_v.ptr() + output_ptr->plane(2)->info()->strides_in_bytes().y(), vvec); + }, + in_y, in_u, in_v, out_y, out_u, out_v); +} + +/** Convert RGB to NV12. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output NV12 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template <bool alpha> +void colorconvert_rgb_to_nv12(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_uv(output_ptr->plane(1), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... 
+ + store_rgb_to_nv12(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], + ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], + out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), + out_uv.ptr()); + }, + in, out_y, out_uv); +} + +/** Convert RGB to IYUV. + * + * @param[in] input Input RGB data buffer. + * @param[out] output Output IYUV buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template <bool alpha> +void colorconvert_rgb_to_iyuv(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + // UV's width and height are subsampled + Window win_uv(win); + win_uv.set(Window::DimX, Window::Dimension(win_uv.x().start() / 2, win_uv.x().end() / 2, win_uv.x().step() / 2)); + win_uv.set(Window::DimY, Window::Dimension(win_uv.y().start() / 2, win_uv.y().end() / 2, 1)); + win_uv.validate(); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win_uv); + Iterator out_v(output_ptr->plane(2), win_uv); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_rgb_top = load_rgb(in.ptr(), alpha); + const auto ta_rgb_bottom = load_rgb(in.ptr() + input_ptr->info()->strides_in_bytes().y(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_iyuv(ta_rgb_top.val[0], ta_rgb_top.val[1], ta_rgb_top.val[2], + ta_rgb_bottom.val[0], ta_rgb_bottom.val[1], ta_rgb_bottom.val[2], + out_y.ptr(), out_y.ptr() + output_ptr->plane(0)->info()->strides_in_bytes().y(), + out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); +} + +/** Convert RGB to YUV4. + * + * @param[in] input Input RGB data buffer. 
+ * @param[out] output Output YUV4 buffer. + * @param[in] win Window for iterating the buffers. + * + */ +template <bool alpha> +void colorconvert_rgb_to_yuv4(const void *__restrict input, void *__restrict output, const Window &win) +{ + ARM_COMPUTE_ERROR_ON(nullptr == input); + ARM_COMPUTE_ERROR_ON(nullptr == output); + win.validate(); + + const auto input_ptr = static_cast<const IImage *__restrict>(input); + const auto output_ptr = static_cast<IMultiImage *__restrict>(output); + + Iterator in(input_ptr, win); + Iterator out_y(output_ptr->plane(0), win); + Iterator out_u(output_ptr->plane(1), win); + Iterator out_v(output_ptr->plane(2), win); + + execute_window_loop(win, [&](const Coordinates &) + { + const auto ta_rgb = load_rgb(in.ptr(), alpha); + //ta_rgb.val[0] = R0 R1 R2 R3 ... + //ta_rgb.val[1] = G0 G1 G2 G3 ... + //ta_rgb.val[2] = B0 B1 B2 B3 ... + + store_rgb_to_yuv4(ta_rgb.val[0], ta_rgb.val[1], ta_rgb.val[2], + out_y.ptr(), out_u.ptr(), out_v.ptr()); + }, + in, out_y, out_u, out_v); +} +} // namespace arm_compute diff --git a/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h new file mode 100644 index 0000000000..96defbc9c9 --- /dev/null +++ b/src/core/NEON/kernels/detail/NEDirectConvolution3x3.h @@ -0,0 +1,170 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H +#define ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace detail +{ +inline float32x4x3_t load_matrix_row(const float *ptr) +{ + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} + +template <unsigned int stridex> +float32x4x2_t convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2); + +template <> +inline float32x4x2_t convolve_3x3<1>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +{ + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + float32x4x2_t out = + { + { + vmulq_f32(vtop.val[0], m0.val[0]), + vmulq_f32(vtop.val[1], m0.val[0]) + } + }; + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], 
vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<2>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + return out; +} + +template <> +inline float32x4x2_t convolve_3x3<3>(const float *in_top, const float *in_mid, const float *in_low, const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2) +{ + float32x4x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + return out; +} + +template <unsigned int stridex> +void 
store_results(float *buffer, const float32x4x2_t &values); + +template <> +void store_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +template <unsigned int stridex> +int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration); + +template <> +int get_input_num_elems_processed<1>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration; +} + +template <> +int get_input_num_elems_processed<2>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration << 1; +} + +template <> +int get_input_num_elems_processed<3>(unsigned int num_elems_written_per_iteration) +{ + return num_elems_written_per_iteration * 3; +} +} +} // namespace arm_compute +#endif /* ARM_COMPUTE_NECONVOLUTIONKERNEL3x3_H */
\ No newline at end of file diff --git a/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h new file mode 100644 index 0000000000..d7ee70a1cd --- /dev/null +++ b/src/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -0,0 +1,965 @@ +/* + * Copyright (c) 2017-2020 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H +#define ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H + +#include "arm_compute/core/AccessWindowStatic.h" +#include "arm_compute/core/utils/misc/Requires.h" +#include "src/core/NEON/NEFixedPoint.h" +#include "src/core/NEON/wrapper/wrapper.h" + +#include <arm_neon.h> + +namespace arm_compute +{ +namespace detail +{ +/** Loads a 3x3 matrix as a row (float). + * + * @param[in] ptr Pointer to a float 3x3 matrix. 
+ * @param[in] weights_offset (Optional) Weights quantization offset. + * + * @return The loaded matrix. + */ +inline float32x4x3_t load_matrix_row(const float *ptr, int weights_offset = 0) +{ + ARM_COMPUTE_UNUSED(weights_offset); + const float32x4x3_t r = + { + { + vld1q_dup_f32(ptr), + vld1q_dup_f32(1 + ptr), + vld1q_dup_f32(2 + ptr) + } + }; + return r; +} + +/** Loads a 3x3 matrix as a row (uint8_t/int8_t). + * + * @param[in] ptr Pointer to a uint8_t/int8_t 3x3 matrix. + * @param[in] weights_offset (Optional) Weights quantization offset. + * + * @return The loaded matrix. + */ +template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > +inline int32x4x3_t load_matrix_row(const T *ptr, int weights_offset = 0) +{ + const int32x4_t v_weights_offset = vdupq_n_s32(weights_offset); + + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + int32x4x3_t r = + { + { + vaddq_s32(v_weights_offset, vdupq_n_s32(*ptr)), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 1))), + vaddq_s32(v_weights_offset, vdupq_n_s32(*(ptr + 2))) + } + }; + return r; +} + +/** Stores a float32x4x2_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. 
+ * + */ +template <unsigned int stridex> +void store_results(float *buffer, const float32x4x2_t &values); + +template <> +inline void store_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); + vst1q_f32(buffer + 4, values.val[1]); +} + +template <> +inline void store_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vget_low_f32(values.val[0])); +} + +/** Stores a uint32_t array into a memory location. + * + * @param[in] buffer Pointer to the memory location where the values will be stored. + * @param[in] values Values that will be stored. + * + */ +template <unsigned int stridex> +void store_results(int32_t *buffer, const int32x4x2_t &values); + +template <> +inline void store_results<1>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, values.val[0]); + vst1q_s32(buffer + 4, values.val[1]); +} + +template <> +inline void store_results<2>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1q_s32(buffer, values.val[0]); +} + +template <> +inline void store_results<3>(int32_t *buffer, const int32x4x2_t &values) +{ + vst1_s32(buffer, vget_low_s32(values.val[0])); +} + +template <unsigned int stridex> +inline void accumulate_results(float *buffer, const float32x4x2_t &values); + +template <> +inline void accumulate_results<1>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); + vst1q_f32(buffer + 4, vaddq_f32(vld1q_f32(buffer + 4), values.val[1])); +} + +template <> +inline void accumulate_results<2>(float *buffer, const float32x4x2_t &values) +{ + vst1q_f32(buffer, vaddq_f32(vld1q_f32(buffer), values.val[0])); +} + +template <> +inline void accumulate_results<3>(float *buffer, const float32x4x2_t &values) +{ + vst1_f32(buffer, vadd_f32(vld1_f32(buffer), vget_low_f32(values.val[0]))); +} + 
/** Adds an int32x4x2_t array to the values already stored at a memory location.
 *
 * Stride 1 updates 8 values, stride 2 updates 4 and stride 3 updates 2.
 *
 * @param[in,out] buffer Pointer to the memory location that is read, incremented and written back.
 * @param[in]     values Values that will be added.
 */
template <unsigned int stridex>
void accumulate_results(int32_t *buffer, const int32x4x2_t &values);

template <>
inline void accumulate_results<1>(int32_t *buffer, const int32x4x2_t &values)
{
    const int32x4_t sum_first = vaddq_s32(vld1q_s32(buffer), values.val[0]);
    vst1q_s32(buffer, sum_first);
    const int32x4_t sum_second = vaddq_s32(vld1q_s32(buffer + 4), values.val[1]);
    vst1q_s32(buffer + 4, sum_second);
}

template <>
inline void accumulate_results<2>(int32_t *buffer, const int32x4x2_t &values)
{
    const int32x4_t sum = vaddq_s32(vld1q_s32(buffer), values.val[0]);
    vst1q_s32(buffer, sum);
}

template <>
inline void accumulate_results<3>(int32_t *buffer, const int32x4x2_t &values)
{
    const int32x2_t sum = vadd_s32(vld1_s32(buffer), vget_low_s32(values.val[0]));
    vst1_s32(buffer, sum);
}

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
/** Stores a float16x8x2_t array into a memory location.
 *
 * Stride 1 writes 16 values (both vectors), stride 2 writes 8 (val[0] only) and
 * stride 3 writes 4 (the low half of val[0]).
 *
 * @param[in] buffer Pointer to the memory location where the values will be stored.
 * @param[in] values Values that will be stored.
 *
 */
template <unsigned int stridex>
void store_results(float16_t *buffer, const float16x8x2_t &values);

template <>
inline void store_results<1>(float16_t *buffer, const float16x8x2_t &values)
{
    const float16x8_t first  = values.val[0];
    const float16x8_t second = values.val[1];
    vst1q_f16(&buffer[0], first);
    vst1q_f16(&buffer[8], second);
}

template <>
inline void store_results<2>(float16_t *buffer, const float16x8x2_t &values)
{
    const float16x8_t first = values.val[0];
    vst1q_f16(buffer, first);
}

template <>
inline void store_results<3>(float16_t *buffer, const float16x8x2_t &values)
{
    const float16x4_t low_half = vget_low_f16(values.val[0]);
    vst1_f16(buffer, low_half);
}

/** Adds a float16x8x2_t array to the values already stored at a memory location.
 *
 * Touches the same number of elements as the float16 store_results() specialization
 * for the same stride.
 *
 * @param[in,out] buffer Pointer to the memory location that is read, incremented and written back.
 * @param[in]     values Values that will be added.
 */
template <unsigned int stridex>
inline void accumulate_results(float16_t *buffer, const float16x8x2_t &values);

template <>
inline void accumulate_results<1>(float16_t *buffer, const float16x8x2_t &values)
{
    const float16x8_t sum_first = vaddq_f16(vld1q_f16(buffer), values.val[0]);
    vst1q_f16(buffer, sum_first);
    const float16x8_t sum_second = vaddq_f16(vld1q_f16(buffer + 8), values.val[1]);
    vst1q_f16(buffer + 8, sum_second);
}

template <>
inline void accumulate_results<2>(float16_t *buffer, const float16x8x2_t &values)
{
    const float16x8_t sum = vaddq_f16(vld1q_f16(buffer), values.val[0]);
    vst1q_f16(buffer, sum);
}

template <>
inline void accumulate_results<3>(float16_t *buffer, const float16x8x2_t &values)
{
    const float16x4_t sum = vadd_f16(vld1_f16(buffer), vget_low_f16(values.val[0]));
    vst1_f16(buffer, sum);
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */

/** Perform a 3x3 convolution for 4 consecutive elements on float32 when dilation.x() or dilation.y() is not 1.
 *
 * For every row the three filter taps are read at offsets 0, dilation_x and 2 * dilation_x
 * (4 floats each) and accumulated in row order: top, mid, low.
 *
 * @param[in] in_top       Pointer to the first row of the input.
 * @param[in] in_mid       Pointer to the second row of the input.
 * @param[in] in_low       Pointer to the third row of the input.
 * @param[in] m0           First row of the filter.
 * @param[in] m1           Second row of the filter.
 * @param[in] m2           Third row of the filter.
 * @param[in] dilation_x   Dilation, in elements across x.
 * @param[in] input_offset Input quantization offset. Unused by this float overload.
 *
 * @return The four convolution results.
 */
inline float32x4_t single_convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low,
                                                const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2,
                                                const size_t dilation_x, int input_offset)
{
    ARM_COMPUTE_UNUSED(input_offset);

    // Loads the three dilated taps of one input row
    const auto load_taps = [dilation_x](const float *row)
    {
        const float32x4x3_t taps =
        {
            {
                vld1q_f32(row),
                vld1q_f32(row + dilation_x),
                vld1q_f32(row + 2 * dilation_x)
            }
        };
        return taps;
    };

    const float32x4x3_t top_taps = load_taps(in_top);
    const float32x4x3_t mid_taps = load_taps(in_mid);
    const float32x4x3_t low_taps = load_taps(in_low);

    float32x4_t acc = vmulq_f32(top_taps.val[0], m0.val[0]);
    acc             = vmlaq_f32(acc, top_taps.val[1], m0.val[1]);
    acc             = vmlaq_f32(acc, top_taps.val[2], m0.val[2]);

    acc = vmlaq_f32(acc, mid_taps.val[0], m1.val[0]);
    acc = vmlaq_f32(acc, mid_taps.val[1], m1.val[1]);
    acc = vmlaq_f32(acc, mid_taps.val[2], m1.val[2]);

    acc = vmlaq_f32(acc, low_taps.val[0], m2.val[0]);
    acc = vmlaq_f32(acc, low_taps.val[1], m2.val[1]);
    acc = vmlaq_f32(acc, low_taps.val[2], m2.val[2]);

    return acc;
}

+/** Perform a 3x3 convolution for 8 consecutive elements on float32 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +inline float32x4x2_t convolve_3x3_dilation(const float *in_top, const float *in_mid, const float *in_low, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + const size_t dilation_x, unsigned int stridex, int input_offset = 0) +{ + ARM_COMPUTE_ERROR_ON(stridex > 3); + float32x4x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) + } + }; + + if(stridex == 2) + { + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 0), out.val[0], 2); + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[1], 2), out.val[0], 3); + } + else if(stridex == 3) + { + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + } + + return out; +} + +/** Perform a convolve3x3 on float32. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[out] out_ptr Pointer to the output. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. 
+ * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. + * + */ +template <bool accumulate> +void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + unsigned int stridex, int input_offset = 0); + +template <bool accumulate> +inline void convolve_3x3(const float *in_top, const float *in_mid, const float *in_low, float *out_ptr, + const float32x4x3_t &m0, const float32x4x3_t &m1, const float32x4x3_t &m2, + unsigned int stridex, int input_offset) +{ + ARM_COMPUTE_UNUSED(input_offset); + ARM_COMPUTE_ERROR_ON(stridex > 3); + + float32x4x2_t out = + { + { + vdupq_n_f32(0.f), + vdupq_n_f32(0.f) + } + }; + if(stridex == 2) + { + const float32x4x2_t vtop = vld2q_f32(in_top); + const float32x4x2_t vmid = vld2q_f32(in_mid); + const float32x4x2_t vlow = vld2q_f32(in_low); + const float32x4_t vtop_end = vld1q_f32(in_top + 8); + const float32x4_t vmid_end = vld1q_f32(in_mid + 8); + const float32x4_t vlow_end = vld1q_f32(in_low + 8); + + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + + out.val[0] = vmlaq_f32(out.val[0], vtop.val[1], m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop_end, 1), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vmid.val[1], m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid_end, 1), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vlow.val[1], m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow_end, 1), m2.val[2]); + + accumulate ? 
accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); + } + else + { + const float32x4x3_t vtop = + { + { + vld1q_f32(in_top), + vld1q_f32(in_top + 4), + vld1q_f32(in_top + 8) + } + }; + const float32x4x3_t vmid = + { + { + vld1q_f32(in_mid), + vld1q_f32(in_mid + 4), + vld1q_f32(in_mid + 8) + } + }; + const float32x4x3_t vlow = + { + { + vld1q_f32(in_low), + vld1q_f32(in_low + 4), + vld1q_f32(in_low + 8) + } + }; + out.val[0] = vmulq_f32(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f32(vtop.val[1], m0.val[0]); + + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 1), m0.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vtop.val[0], vtop.val[1], 2), m0.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vmid.val[0], m1.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 1), m1.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vmid.val[0], vmid.val[1], 2), m1.val[2]); + + out.val[0] = vmlaq_f32(out.val[0], vlow.val[0], m2.val[0]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 1), m2.val[1]); + out.val[0] = vmlaq_f32(out.val[0], vextq_f32(vlow.val[0], vlow.val[1], 2), m2.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 1), m0.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vtop.val[1], vtop.val[2], 2), m0.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vmid.val[1], m1.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 1), m1.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vmid.val[1], vmid.val[2], 2), m1.val[2]); + + out.val[1] = vmlaq_f32(out.val[1], vlow.val[1], m2.val[0]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 1), m2.val[1]); + out.val[1] = vmlaq_f32(out.val[1], vextq_f32(vlow.val[1], vlow.val[2], 2), m2.val[2]); + + if(stridex == 3) + { + out.val[0] = vsetq_lane_f32(vgetq_lane_f32(out.val[0], 3), out.val[0], 1); + accumulate ? 
accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); + } + else + { + accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); + } + } +} + +/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset Input quantization offset. + * + */ +template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > +inline int32x4_t single_convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + size_t dilation_x, int32_t input_offset) +{ + using VectorType = typename std::conditional<std::is_same<T, uint8_t>::value, uint8x8x3_t, int8x8x3_t>::type; + using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; + + const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); + + const VectorType vtop = + { + { + wrapper::vload(in_top), + wrapper::vload(in_top + dilation_x), + wrapper::vload(in_top + 2 * dilation_x) + } + }; + const VectorType vmid = + { + { + wrapper::vload(in_mid), + wrapper::vload(in_mid + dilation_x), + wrapper::vload(in_mid + 2 * dilation_x) + } + }; + const VectorType vlow = + { + { + wrapper::vload(in_low), + wrapper::vload(in_low + dilation_x), + wrapper::vload(in_low + 2 * dilation_x) + } + }; + + const int32x4x3_t vtop_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, 
wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[2])))), + } + }; + const int32x4x3_t vmid_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[2])))), + } + }; + const int32x4x3_t vlow_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[2])))), + } + }; + + int32x4_t out = wrapper::vmul(vtop_s32.val[0], m0.val[0]); + out = wrapper::vmla(out, vtop_s32.val[1], m0.val[1]); + out = wrapper::vmla(out, vtop_s32.val[2], m0.val[2]); + + out = wrapper::vmla(out, vmid_s32.val[0], m1.val[0]); + out = wrapper::vmla(out, vmid_s32.val[1], m1.val[1]); + out = wrapper::vmla(out, vmid_s32.val[2], m1.val[2]); + + out = wrapper::vmla(out, vlow_s32.val[0], m2.val[0]); + out = wrapper::vmla(out, vlow_s32.val[1], m2.val[1]); + out = wrapper::vmla(out, vlow_s32.val[2], m2.val[2]); + + return out; +} + +/** Perform a 3x3 convolution for 4 consecutive 8-bit elements when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] stridex Stride value in elements across x. 
+ * @param[in] input_offset Input quantization offset. + * + */ +template < typename T, REQUIRES_TA(std::is_same<T, uint8_t>::value || std::is_same<T, int8_t>::value) > +inline int32x4x2_t convolve_3x3_dilation(const T *in_top, const T *in_mid, const T *in_low, const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + const size_t dilation_x, unsigned int stridex, int input_offset) +{ + ARM_COMPUTE_ERROR_ON(stridex > 3); + int32x4x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 4, in_mid + 4, in_low + 4, m0, m1, m2, dilation_x, input_offset) + } + }; + + if(stridex == 2) + { + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); + } + else if(stridex == 3) + { + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); + } + return out; +} + +/** Perform a convolve3x3 on 8-bit elements + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[out] out_ptr Pointer to the output. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset Input quantization offset. 
+ * + */ +template < bool accumulate, typename T1, typename T2, REQUIRES_TA(std::is_same<T1, uint8_t>::value || std::is_same<T1, int8_t>::value) > +void convolve_3x3(const T1 *in_top, const T1 *in_mid, const T1 *in_low, T2 *out_ptr, + const int32x4x3_t &m0, const int32x4x3_t &m1, const int32x4x3_t &m2, + unsigned int stridex, int32_t input_offset) +{ + ARM_COMPUTE_ERROR_ON(stridex > 3); + using VectorType = typename std::conditional<std::is_same<T1, uint8_t>::value, uint8x8x2_t, int8x8x2_t>::type; + using OutputTagType = typename wrapper::traits::neon_bitvector_tag_t<int32_t, wrapper::traits::BitWidth::W128>; + + const int32x4_t v_input_offset = wrapper::vdup_n(input_offset, OutputTagType{}); + + const VectorType vtop = + { + { + wrapper::vload(in_top), + wrapper::vload(in_top + 8) + } + }; + const VectorType vmid = + { + { + wrapper::vload(in_mid), + wrapper::vload(in_mid + 8) + } + }; + const VectorType vlow = + { + { + wrapper::vload(in_low), + wrapper::vload(in_low + 8) + } + }; + + const int32x4x3_t vtop_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vtop.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vtop.val[1])))), + } + }; + const int32x4x3_t vmid_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vmid.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vmid.val[1])))), + } + }; + const int32x4x3_t vlow_s32 = + { + { + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[0])))), + wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgethigh(wrapper::vmovl(vlow.val[0])))), + 
wrapper::vaddw(v_input_offset, wrapper::vreinterpret(wrapper::vgetlow(wrapper::vmovl(vlow.val[1])))), + } + }; + + int32x4x2_t out + { + { + wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}), + wrapper::vdup_n(static_cast<int32_t>(0), OutputTagType{}), + } + }; + + // 0 + out.val[0] = wrapper::vmla(out.val[0], vtop_s32.val[0], m0.val[0]); + out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vtop_s32.val[0], vtop_s32.val[1]), m0.val[1]); + out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vtop_s32.val[0], vtop_s32.val[1]), m0.val[2]); + + out.val[0] = wrapper::vmla(out.val[0], vmid_s32.val[0], m1.val[0]); + out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vmid_s32.val[0], vmid_s32.val[1]), m1.val[1]); + out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vmid_s32.val[0], vmid_s32.val[1]), m1.val[2]); + + out.val[0] = wrapper::vmla(out.val[0], vlow_s32.val[0], m2.val[0]); + out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_1(vlow_s32.val[0], vlow_s32.val[1]), m2.val[1]); + out.val[0] = wrapper::vmla(out.val[0], wrapper::vext_2(vlow_s32.val[0], vlow_s32.val[1]), m2.val[2]); + + // 1 + out.val[1] = wrapper::vmla(out.val[1], vtop_s32.val[1], m0.val[0]); + out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vtop_s32.val[1], vtop_s32.val[2]), m0.val[1]); + out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vtop_s32.val[1], vtop_s32.val[2]), m0.val[2]); + + out.val[1] = wrapper::vmla(out.val[1], vmid_s32.val[1], m1.val[0]); + out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vmid_s32.val[1], vmid_s32.val[2]), m1.val[1]); + out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vmid_s32.val[1], vmid_s32.val[2]), m1.val[2]); + + out.val[1] = wrapper::vmla(out.val[1], vlow_s32.val[1], m2.val[0]); + out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_1(vlow_s32.val[1], vlow_s32.val[2]), m2.val[1]); + out.val[1] = wrapper::vmla(out.val[1], wrapper::vext_2(vlow_s32.val[1], vlow_s32.val[2]), m2.val[2]); + + if(stridex == 1) + { + 
accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); + } + else if(stridex == 2) + { + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 2), out.val[0], 1); + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 0), out.val[0], 2); + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[1], 2), out.val[0], 3); + + accumulate ? accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); + } + else if(stridex == 3) + { + out.val[0] = wrapper::vsetlane(wrapper::vgetlane(out.val[0], 3), out.val[0], 1); + accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); + } +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +/** Loads a 3x3 matrix as a row (float16_t). + * + * @param[in] ptr Pointer to a float 3x3 matrix. + * + * @return The loaded matrix. + */ +inline float16x8x3_t load_matrix_row(const float16_t *ptr, int weights_offset = 0) +{ + ARM_COMPUTE_UNUSED(weights_offset); + /* ptr is a pointer to a row in a 3x3 matrix, the function returns 3 vectors holding exactly the same value in all lanes: + r.val[0] contains the first element, r.val[1] the second element and r.val[2] the third element (in all lanes) */ + const float16x8x3_t r = + { + { + vld1q_dup_f16(ptr), + vld1q_dup_f16(1 + ptr), + vld1q_dup_f16(2 + ptr) + } + }; + return r; +} + +/** Perform a 3x3 convolution for 8 consecutive elements on float16 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] input_offset (Optional)Input quantization offset. 
+ * + */ +inline float16x8_t single_convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, int input_offset = 0) +{ + ARM_COMPUTE_UNUSED(input_offset); + const float16x8x3_t vtop = + { + { + vld1q_f16(in_top), + vld1q_f16(in_top + dilation_x), + vld1q_f16(in_top + 2 * dilation_x) + } + }; + const float16x8x3_t vmid = + { + { + vld1q_f16(in_mid), + vld1q_f16(in_mid + dilation_x), + vld1q_f16(in_mid + 2 * dilation_x) + } + }; + const float16x8x3_t vlow = + { + { + vld1q_f16(in_low), + vld1q_f16(in_low + dilation_x), + vld1q_f16(in_low + 2 * dilation_x) + } + }; + float16x8_t out = vmulq_f16(vtop.val[0], m0.val[0]); + out = vaddq_f16(out, vmulq_f16(vtop.val[1], m0.val[1])); + out = vaddq_f16(out, vmulq_f16(vtop.val[2], m0.val[2])); + + out = vaddq_f16(out, vmulq_f16(vmid.val[0], m1.val[0])); + out = vaddq_f16(out, vmulq_f16(vmid.val[1], m1.val[1])); + out = vaddq_f16(out, vmulq_f16(vmid.val[2], m1.val[2])); + + out = vaddq_f16(out, vmulq_f16(vlow.val[0], m2.val[0])); + out = vaddq_f16(out, vmulq_f16(vlow.val[1], m2.val[1])); + out = vaddq_f16(out, vmulq_f16(vlow.val[2], m2.val[2])); + + return out; +} + +/** Perform a 3x3 convolution for 16 consecutive elements on float16 when dilation.x() or dilation.y() is not 1. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] dilation_x Dilation, in elements across x. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. 
+ * + */ +inline float16x8x2_t convolve_3x3_dilation(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + const size_t dilation_x, unsigned int stridex, int input_offset = 0) +{ + float16x8x2_t out = + { + { + single_convolve_3x3_dilation(in_top, in_mid, in_low, m0, m1, m2, dilation_x, input_offset), + single_convolve_3x3_dilation(in_top + 8, in_mid + 8, in_low + 8, m0, m1, m2, dilation_x, input_offset) + } + }; + + if(stridex == 2) + { + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); + } + else if(stridex == 3) + { + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3); + } + + return out; +} + +/** Perform a convolve3x3 on float16. + * + * @param[in] in_top Pointer to the first row of the input. + * @param[in] in_mid Pointer to the second row of the input. + * @param[in] in_low Pointer to the third row of the input. + * @param[out] out_ptr Pointer to the output. + * @param[in] m0 First row of the filter. + * @param[in] m1 Second row of the filter. + * @param[in] m2 Third row of the filter. + * @param[in] stridex Stride value in elements across x. + * @param[in] input_offset (Optional) Input quantization offset. 
+ * + */ +template <bool accumulate> +inline void convolve_3x3(const float16_t *in_top, const float16_t *in_mid, const float16_t *in_low, float16_t *out_ptr, + const float16x8x3_t &m0, const float16x8x3_t &m1, const float16x8x3_t &m2, + unsigned int stridex, int input_offset = 0) +{ + ARM_COMPUTE_UNUSED(input_offset); + + float16x8x2_t out = + { + { + vdupq_n_f16(0), + vdupq_n_f16(0) + } + }; + if(stridex == 2) + { + const float16x8x2_t vtop = vld2q_f16(in_top); + const float16x8x2_t vmid = vld2q_f16(in_mid); + const float16x8x2_t vlow = vld2q_f16(in_low); + const float16x8_t vtop_end = vld1q_f16(in_top + 16); + const float16x8_t vmid_end = vld1q_f16(in_mid + 16); + const float16x8_t vlow_end = vld1q_f16(in_low + 16); + + out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); + + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vtop.val[1], m0.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop_end, 1), m0.val[2])); + + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[1], m1.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid_end, 1), m1.val[2])); + + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[1], m2.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow_end, 1), m2.val[2])); + + accumulate ? 
accumulate_results<2>(out_ptr, out) : store_results<2>(out_ptr, out); + } + else + { + const float16x8x3_t vtop = + { + { + vld1q_f16(in_top), + vld1q_f16(in_top + 8), + vld1q_f16(in_top + 16) + } + }; + const float16x8x3_t vmid = + { + { + vld1q_f16(in_mid), + vld1q_f16(in_mid + 8), + vld1q_f16(in_mid + 16) + } + }; + const float16x8x3_t vlow = + { + { + vld1q_f16(in_low), + vld1q_f16(in_low + 8), + vld1q_f16(in_low + 16) + } + }; + out.val[0] = vmulq_f16(vtop.val[0], m0.val[0]); + out.val[1] = vmulq_f16(vtop.val[1], m0.val[0]); + + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 1), m0.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vtop.val[0], vtop.val[1], 2), m0.val[2])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vmid.val[0], m1.val[0])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 1), m1.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vmid.val[0], vmid.val[1], 2), m1.val[2])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vlow.val[0], m2.val[0])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 1), m2.val[1])); + out.val[0] = vaddq_f16(out.val[0], vmulq_f16(vextq_f16(vlow.val[0], vlow.val[1], 2), m2.val[2])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 1), m0.val[1])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vtop.val[1], vtop.val[2], 2), m0.val[2])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vmid.val[1], m1.val[0])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 1), m1.val[1])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vmid.val[1], vmid.val[2], 2), m1.val[2])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vlow.val[1], m2.val[0])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], vlow.val[2], 1), m2.val[1])); + out.val[1] = vaddq_f16(out.val[1], vmulq_f16(vextq_f16(vlow.val[1], 
vlow.val[2], 2), m2.val[2])); + + if(stridex == 3) + { + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3); + + accumulate ? accumulate_results<3>(out_ptr, out) : store_results<3>(out_ptr, out); + } + else + { + accumulate ? accumulate_results<1>(out_ptr, out) : store_results<1>(out_ptr, out); + } + } +} +#endif /** __ARM_FEATURE_FP16_VECTOR_ARITHMETIC **/ + +/** Get the number of elements processed on 3x3 convolution. + * + * @param[in] num_elems_written_per_iteration Number of elements written per iteration on 3x3 convolution. + * @param[in] stridex Stride value in elements across x. + * + * @return The number of elements processed. + */ +inline int get_input_num_elems_processed(unsigned int num_elems_written_per_iteration, unsigned int stridex) +{ + switch(stridex) + { + case 1: + return num_elems_written_per_iteration; + case 2: + return num_elems_written_per_iteration << 1; + case 3: + return num_elems_written_per_iteration * 3; + default: + ARM_COMPUTE_ERROR("stridex not supported"); + return 0; + } +} +} +} // namespace arm_compute +#endif /* ARM_COMPUTE_NEDIRECTCONVOLUTIONDETAIL_H */ |