From e417ff1d9fde119a238582a3b1feb914edd95c38 Mon Sep 17 00:00:00 2001 From: Michalis Spyrou Date: Tue, 28 Jun 2022 19:46:42 +0100 Subject: Fix build errors on armv8.6 SVE2 with NDK 23 and 24 Extensive use of templates resulted in a compiler crash on NDK 23 and 24. This rework solves the issue and also reduces the library size by 101Kb. Resolves: COMPMID-5384 Change-Id: I9c5c68c5e36f236b0891e44d25478743417fb16d Signed-off-by: Michalis Spyrou Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/7871 Reviewed-by: Gunes Bayir Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Benchmark: Arm Jenkins --- .../kernels/elementwise_binary/generic/sve2/impl.h | 303 +++++++++++---------- 1 file changed, 157 insertions(+), 146 deletions(-) (limited to 'src/cpu/kernels/elementwise_binary/generic/sve2/impl.h') diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h index f34d05eb37..41e0ac77db 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -31,37 +31,6 @@ namespace cpu { using namespace arm_compute::wrapper; -template -struct QuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; - - const svint32_t &in1_offset; - const svint32_t &in2_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &in2_scale; - const svfloat32_t &out_scale; -}; - -template -struct BroadcastQuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - float broadcast_value; - OutputScalarType *output_ptr; - bool reorder; - - const svint32_t &in1_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &out_scale; -}; - inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) { auto x = svld1(pg, ptr); @@ -131,98 +100,143 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svst1(pg, ptr, narrowed); } -template -inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) +template +void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + const auto all_true_pg = wrapper::svptrue(); - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); -template -inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? in1 : in2; + const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); + const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - const auto result = svcreate4( - elementwise_arithmetic_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_arithmetic_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_arithmetic_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_arithmetic_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} + const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); -template -inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); + const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); - using OutputVectorType = typename wrapper::traits::sve_vector::type; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_f = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); -template -inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? in1 : in2; + svfloat32x4_t result{}; - using OutputVectorType = typename wrapper::traits::sve_vector::type; + if(!is_broadcast_input_2) + { + result = svcreate4( + elementwise_arithmetic_op(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_arithmetic_op(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_arithmetic_op(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_arithmetic_op(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4( + elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + } - const auto result = svcreate4( - elementwise_comparison_op(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_comparison_op(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_comparison_op(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_comparison_op(pg, svget4(af, 3), svget4(bf, 3), args.op)); + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); + const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); -template -using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments &); + const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); + const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); -template -using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments &); + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast(output.ptr()); + const auto input1_ptr = reinterpret_cast(input1.ptr()); + const auto input2_ptr = reinterpret_cast(input2.ptr()); -template ::type, - typename OutputScalarType = typename wrapper::sve_scalar::type> -void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopQuantizedFuncType func, - BroadcastQuantizedLoopFuncType broadcast_func) + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + + const auto result = svcreate4( + elementwise_arithmetic_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt(); + pg = wrapper::svwhilelt(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template +void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + + using OutputVectorType = typename wrapper::traits::sve_vector::type; const auto all_true_pg = wrapper::svptrue(); // Create input windows @@ -237,9 +251,6 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o const auto window_end_x = static_cast(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); - const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - if(is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -266,23 +277,40 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o auto output_ptr = reinterpret_cast(output.ptr()); const auto non_broadcast_input_ptr = reinterpret_cast(non_broadcast_input.ptr()); const InputScalarType broadcast_value = *reinterpret_cast(broadcast_input.ptr()); + const float broadcast_value_f = Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); int x = window_start_x; svbool_t pg = wrapper::svwhilelt(x, window_end_x); do { - const auto args = BroadcastQuantizedLoopArguments + const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svuint8x4_t result{}; + + if(!is_broadcast_input_2) { - op, - non_broadcast_input_ptr + x, - Qasymm8QuantizationHelper::dequantize(broadcast_value, broadcast_qinfo), - output_ptr + x, - !is_broadcast_input_2, - non_broadcast_voffset, output_voffset, - non_broadcast_vscale, output_vscale - }; - broadcast_func(pg, args); + result = svcreate4( + elementwise_comparison_op(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_comparison_op(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_comparison_op(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_comparison_op(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4( + elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + x += wrapper::svcnt(); pg = wrapper::svwhilelt(x, window_end_x); } @@ -317,16 +345,19 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o svbool_t pg = wrapper::svwhilelt(x, window_end_x); do { - const auto args = QuantizedLoopArguments - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x, - in1_voffset, in2_voffset, output_voffset, - in1_vscale, in2_vscale, output_vscale - }; - func(pg, args); + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + const auto result = svcreate4( + elementwise_comparison_op(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_comparison_op(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_comparison_op(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_comparison_op(pg, svget4(in1, 3), svget4(in2, 3), op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + x += wrapper::svcnt(); pg = wrapper::svwhilelt(x, window_end_x); } @@ -335,26 +366,6 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o input1, input2, output); } } - -template -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using VectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - &arithmetic_op_quantized_loop, - &arithmetic_op_broadcast_quantized_loop); -} - -template -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename wrapper::traits::sve_vector::type; - using OutputVectorType = typename wrapper::traits::sve_vector::type; - elementwise_quantized_op(in1, in2, out, window, op, - &comparison_op_quantized_loop, - &comparison_op_broadcast_quantized_loop); -} } // namespace cpu } // namespace arm_compute -- cgit v1.2.1