diff options
Diffstat (limited to 'src/core')
-rw-r--r-- | src/core/NEON/kernels/NEBox3x3Kernel.cpp | 9 | ||||
-rw-r--r-- | src/core/NEON/kernels/NEReductionOperationKernel.cpp | 2 |
2 files changed, 7 insertions, 4 deletions
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp index 48b959c308..7ca5e3c65c 100644 --- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp +++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp @@ -33,7 +33,8 @@ using namespace arm_compute; -int16x8_t calculate_kernel( const uint8x16_t &top_data, const uint8x16_t &mid_data, const uint8x16_t &bot_data){ +int16x8_t calculate_kernel(const uint8x16_t &top_data, const uint8x16_t &mid_data, const uint8x16_t &bot_data) +{ const int16x8x2_t top_s16 = { { @@ -101,8 +102,8 @@ void NEBox3x3FP16Kernel::run(const Window &window, const ThreadInfo &info) int16x8_t out = calculate_kernel(top_data, mid_data, bot_data); - float16x8_t outfloat = vcvtq_f16_u16(out); - outfloat = vmulq_f16(outfloat, oneovernine); + float16x8_t outfloat = vcvtq_f16_s16(out); + outfloat = vmulq_f16(outfloat, oneovernine); vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(outfloat))); }, @@ -182,7 +183,7 @@ void NEBox3x3Kernel::run(const Window &window, const ThreadInfo &info) outfloathigh = vshrq_n_s32(outfloathigh, shift); outfloatlow = vshrq_n_s32(outfloatlow, shift); out = vcombine_s16(vqmovn_s32((outfloatlow)), - vqmovn_s32((outfloathigh))); + vqmovn_s32((outfloathigh))); vst1_u8(output.ptr(), vqmovun_s16(out)); }, diff --git a/src/core/NEON/kernels/NEReductionOperationKernel.cpp b/src/core/NEON/kernels/NEReductionOperationKernel.cpp index 1bfef27d49..67ccc5d736 100644 --- a/src/core/NEON/kernels/NEReductionOperationKernel.cpp +++ b/src/core/NEON/kernels/NEReductionOperationKernel.cpp @@ -64,6 +64,7 @@ uint32x4x4_t calculate_index(uint32_t idx, T a, T b, uint32x4x4_t c, ReductionOp return res; } +template <> uint32x4x4_t calculate_index(uint32_t idx, uint8x16_t a, uint8x16_t b, uint32x4x4_t c, ReductionOperation op, int axis) { uint32x4x4_t mask{ { 0 } }; @@ -227,6 +228,7 @@ uint32_t calculate_vector_index(uint32x4x4_t vec_res_idx, uint8x16_t vec_res_val return (res - 0xFFFFFFFF); } #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +template <> uint32x4x4_t calculate_index(uint32_t idx, float16x8_t a, float16x8_t b, uint32x4x4_t c, ReductionOperation op, int axis) { uint32x4x2_t mask{ 0 }; |