about | summary | refs | log | tree | commit | diff
diff options
context:
space:
mode:
author Alan Kelly <me@alankelly.dev> 2019-04-10 21:37:05 +0200
committer Georgios Pinitas <georgios.pinitas@arm.com> 2019-05-17 16:25:00 +0000
commit efcefeadd0b3ccb78dea1912cfa792cd87088c10 (patch)
tree 64b233772928cefc71aff8e840926f6974650555
parent a400334f60c28a464ba44170350ec475af73a94a (diff)
download ComputeLibrary-efcefeadd0b3ccb78dea1912cfa792cd87088c10.tar.gz
Optimizes NEBox3x3FP16Kernel by removing unnecessary int to float conversions
Signed-off-by: Alan Kelly <me@alankelly.dev>
Change-Id: Icf54d5f37455fc5ba0fbc5f15d6e481a7a751d74
Reviewed-on: https://review.mlplatform.org/c/1044
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
-rw-r--r-- src/core/NEON/kernels/NEBox3x3Kernel.cpp 41
1 file changed, 21 insertions, 20 deletions
diff --git a/src/core/NEON/kernels/NEBox3x3Kernel.cpp b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
index a401aa7e79..126b62b21e 100644
--- a/src/core/NEON/kernels/NEBox3x3Kernel.cpp
+++ b/src/core/NEON/kernels/NEBox3x3Kernel.cpp
@@ -55,52 +55,53 @@ void NEBox3x3FP16Kernel::run(const Window &window, const ThreadInfo &info)
const uint8x16_t mid_data = vld1q_u8(input_mid_ptr + input.offset());
const uint8x16_t bot_data = vld1q_u8(input_bot_ptr + input.offset());
- const float16x8x2_t top_f16 =
+ const uint16x8x2_t top_f16 =
{
{
- vcvtq_f16_u16(vmovl_u8(vget_low_u8(top_data))),
- vcvtq_f16_u16(vmovl_u8(vget_high_u8(top_data)))
+ vmovl_u8(vget_low_u8(top_data)),
+ vmovl_u8(vget_high_u8(top_data))
}
};
- const float16x8x2_t mid_f16 =
+ const uint16x8x2_t mid_f16 =
{
{
- vcvtq_f16_u16(vmovl_u8(vget_low_u8(mid_data))),
- vcvtq_f16_u16(vmovl_u8(vget_high_u8(mid_data)))
+ vmovl_u8(vget_low_u8(mid_data)),
+ vmovl_u8(vget_high_u8(mid_data))
}
};
- const float16x8x2_t bot_f16 =
+ const uint16x8x2_t bot_f16 =
{
{
- vcvtq_f16_u16(vmovl_u8(vget_low_u8(bot_data))),
- vcvtq_f16_u16(vmovl_u8(vget_high_u8(bot_data)))
+ vmovl_u8(vget_low_u8(bot_data)),
+ vmovl_u8(vget_high_u8(bot_data))
}
};
//top left
- float16x8_t out = top_f16.val[0];
+ uint16x8_t out = top_f16.val[0];
//top mid
- out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 1));
+ out = vaddq_u16(out, vextq_u16(top_f16.val[0], top_f16.val[1], 1));
//top right
- out = vaddq_f16(out, vextq_f16(top_f16.val[0], top_f16.val[1], 2));
+ out = vaddq_u16(out, vextq_u16(top_f16.val[0], top_f16.val[1], 2));
//mid left
- out = vaddq_f16(out, mid_f16.val[0]);
+ out = vaddq_u16(out, mid_f16.val[0]);
//mid mid
- out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 1));
+ out = vaddq_u16(out, vextq_u16(mid_f16.val[0], mid_f16.val[1], 1));
//mid right
- out = vaddq_f16(out, vextq_f16(mid_f16.val[0], mid_f16.val[1], 2));
+ out = vaddq_u16(out, vextq_u16(mid_f16.val[0], mid_f16.val[1], 2));
//bot left
- out = vaddq_f16(out, bot_f16.val[0]);
+ out = vaddq_u16(out, bot_f16.val[0]);
//bot mid
- out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 1));
+ out = vaddq_u16(out, vextq_u16(bot_f16.val[0], bot_f16.val[1], 1));
//bot right
- out = vaddq_f16(out, vextq_f16(bot_f16.val[0], bot_f16.val[1], 2));
+ out = vaddq_u16(out, vextq_u16(bot_f16.val[0], bot_f16.val[1], 2));
- out = vmulq_f16(out, oneovernine);
+ float16x8_t outfloat = vcvtq_f16_u16(out);
+ outfloat = vmulq_f16(outfloat, oneovernine);
- vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(out)));
+ vst1_u8(output.ptr(), vqmovun_s16(vcvtq_s16_f16(outfloat)));
},
input, output);
}