diff options
Diffstat (limited to 'src/cpu/kernels/elementwise_binary/generic/sve2/impl.h')
-rw-r--r-- | src/cpu/kernels/elementwise_binary/generic/sve2/impl.h | 303 |
1 files changed, 157 insertions, 146 deletions
diff --git a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h index f34d05eb37..41e0ac77db 100644 --- a/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h +++ b/src/cpu/kernels/elementwise_binary/generic/sve2/impl.h @@ -31,37 +31,6 @@ namespace cpu { using namespace arm_compute::wrapper; -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -struct QuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - const InputScalarType *input2_ptr; - OutputScalarType *output_ptr; - - const svint32_t &in1_offset; - const svint32_t &in2_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &in2_scale; - const svfloat32_t &out_scale; -}; - -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -struct BroadcastQuantizedLoopArguments -{ - OperatorType op; - const InputScalarType *input1_ptr; - float broadcast_value; - OutputScalarType *output_ptr; - bool reorder; - - const svint32_t &in1_offset; - const svint32_t &out_offset; - const svfloat32_t &in1_scale; - const svfloat32_t &out_scale; -}; - inline svfloat32x4_t load_quantized(const int8_t *ptr, svbool_t pg, const svint32_t &offset, const svfloat32_t &scale) { auto x = svld1(pg, ptr); @@ -131,98 +100,143 @@ inline void store_quantized(int8_t *ptr, svbool_t pg, svfloat32x4_t data, const svst1(pg, ptr, narrowed); } -template <typename InputScalarType, typename OutputScalarType> -inline void arithmetic_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) +template <typename ScalarType> +void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ArithmeticOperation op, const Window &window) { - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + const auto all_true_pg = wrapper::svptrue<ScalarType>(); - const auto result = svcreate4( - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + // Create input windows + Window input1_win = window.broadcast_if_dimension_le_one(in1->info()->tensor_shape()); + Window input2_win = window.broadcast_if_dimension_le_one(in2->info()->tensor_shape()); - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} + // Clear X Dimension on execution window as we handle manually + Window win = window; + win.set(Window::DimX, Window::Dimension(0, 1, 1)); -template <typename InputScalarType, typename OutputScalarType> -inline void arithmetic_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ArithmeticOperation> &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + const auto window_start_x = static_cast<int>(window.x().start()); + const auto window_end_x = static_cast<int>(window.x().end()); + const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? in1 : in2; + const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); + const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - const auto result = svcreate4( - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_arithmetic_op<svfloat32_t>(pg, svget4(af, 3), svget4(bf, 3), args.op)); + if(is_broadcast_across_x) + { + const bool is_broadcast_input_2 = input2_win.x().step() == 0; + Window broadcast_win = is_broadcast_input_2 ? input2_win : input1_win; + Window non_broadcast_win = !is_broadcast_input_2 ? input2_win : input1_win; + const ITensor *broadcast_tensor = is_broadcast_input_2 ? in2 : in1; + const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1; - store_quantized(args.output_ptr, pg, result, args.out_offset, args.out_scale); -} + const auto non_broadcast_qinfo = is_broadcast_input_2 ? in1->info()->quantization_info() : in2->info()->quantization_info(); + const auto broadcast_qinfo = is_broadcast_input_2 ? in2->info()->quantization_info() : in1->info()->quantization_info(); -template <typename InputScalarType, typename OutputScalarType> -inline void comparison_op_quantized_loop(svbool_t pg, const QuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = load_quantized(args.input2_ptr, pg, args.in2_offset, args.in2_scale); + const auto non_broadcast_voffset = svdup_n(non_broadcast_qinfo.uniform().offset); + const auto non_broadcast_vscale = svdup_n(non_broadcast_qinfo.uniform().scale); - using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; + // Clear X Dimension on execution window as we handle manually + non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1)); - const auto result = svcreate4( - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), args.op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), args.op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), args.op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), args.op)); + Iterator broadcast_input(broadcast_tensor, broadcast_win); + Iterator non_broadcast_input(non_broadcast_tensor, non_broadcast_win); + Iterator output(out, win); - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto non_broadcast_input_ptr = reinterpret_cast<const ScalarType *>(non_broadcast_input.ptr()); + const ScalarType broadcast_value = *reinterpret_cast<const ScalarType *>(broadcast_input.ptr()); + const float broadcast_value_f = Qasymm8QuantizationHelper<ScalarType>::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); -template <typename InputScalarType, typename OutputScalarType> -inline void comparison_op_broadcast_quantized_loop(svbool_t pg, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, ComparisonOperation> &args) -{ - const auto in1 = load_quantized(args.input1_ptr, pg, args.in1_offset, args.in1_scale); - const auto in2 = svcreate4( - svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value), svdup_n(args.broadcast_value)); + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); - const auto &af = args.reorder ? in2 : in1; - const auto &bf = args.reorder ? in1 : in2; + svfloat32x4_t result{}; - using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; + if(!is_broadcast_input_2) + { + result = svcreate4( + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4( + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); + } - const auto result = svcreate4( - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 0), svget4(bf, 0), args.op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 1), svget4(bf, 1), args.op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 2), svget4(bf, 2), args.op), - elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(af, 3), svget4(bf, 3), args.op)); + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); - const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); - const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); - const auto zipped = svzip1(zipped_bottom, zipped_top); - svst1(pg, args.output_ptr, zipped); -} + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + broadcast_input, non_broadcast_input, output); + } + else + { + // Clear X Dimension on execution window as we handle manually + input1_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + input2_win.set(Window::DimX, Window::Dimension(0, 1, 1)); + + Iterator input1(in1, input1_win); + Iterator input2(in2, input2_win); + Iterator output(out, win); + + const auto in1_voffset = svdup_n(in1->info()->quantization_info().uniform().offset); + const auto in1_vscale = svdup_n(in1->info()->quantization_info().uniform().scale); -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -using LoopQuantizedFuncType = void (*)(svbool_t, const QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); + const auto in2_voffset = svdup_n(in2->info()->quantization_info().uniform().offset); + const auto in2_vscale = svdup_n(in2->info()->quantization_info().uniform().scale); -template <typename InputScalarType, typename OutputScalarType, typename OperatorType> -using BroadcastQuantizedLoopFuncType = void (*)(svbool_t, const BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> &); + execute_window_loop(win, [&](const Coordinates &) + { + auto output_ptr = reinterpret_cast<ScalarType *>(output.ptr()); + const auto input1_ptr = reinterpret_cast<const ScalarType *>(input1.ptr()); + const auto input2_ptr = reinterpret_cast<const ScalarType *>(input2.ptr()); -template <typename InputVectorType, typename OutputVectorType, typename OperatorType, - typename InputScalarType = typename wrapper::sve_scalar<InputVectorType>::type, - typename OutputScalarType = typename wrapper::sve_scalar<OutputVectorType>::type> -void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window, - OperatorType op, - LoopQuantizedFuncType<InputScalarType, OutputScalarType, OperatorType> func, - BroadcastQuantizedLoopFuncType<InputScalarType, OutputScalarType, OperatorType> broadcast_func) + int x = window_start_x; + + svbool_t pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + do + { + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + + const auto result = svcreate4( + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_arithmetic_op<svfloat32_t>(pg, svget4(in1, 3), svget4(in2, 3), op)); + + store_quantized(output_ptr + x, pg, result, output_voffset, output_vscale); + + x += wrapper::svcnt<ScalarType>(); + pg = wrapper::svwhilelt<ScalarType>(x, window_end_x); + } + while(svptest_any(all_true_pg, pg)); + }, + input1, input2, output); + } +} + +template <typename InputScalarType, typename OutputScalarType = uint8_t> +void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, ComparisonOperation op, const Window &window) { + static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); + + using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; const auto all_true_pg = wrapper::svptrue<InputScalarType>(); // Create input windows @@ -237,9 +251,6 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o const auto window_end_x = static_cast<int>(window.x().end()); const bool is_broadcast_across_x = in1->info()->tensor_shape().x() != in2->info()->tensor_shape().x(); - const auto output_voffset = svdup_n(out->info()->quantization_info().uniform().offset); - const auto output_vscale = svdup_n(1.f / out->info()->quantization_info().uniform().scale); - if(is_broadcast_across_x) { const bool is_broadcast_input_2 = input2_win.x().step() == 0; @@ -266,23 +277,40 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o auto output_ptr = reinterpret_cast<OutputScalarType *>(output.ptr()); const auto non_broadcast_input_ptr = reinterpret_cast<const InputScalarType *>(non_broadcast_input.ptr()); const InputScalarType broadcast_value = *reinterpret_cast<const InputScalarType *>(broadcast_input.ptr()); + const float broadcast_value_f = Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo); + const auto in2 = svcreate4(svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f), svdup_n(broadcast_value_f)); int x = window_start_x; svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); do { - const auto args = BroadcastQuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> + const auto in1 = load_quantized(non_broadcast_input_ptr + x, pg, non_broadcast_voffset, non_broadcast_vscale); + + svuint8x4_t result{}; + + if(!is_broadcast_input_2) { - op, - non_broadcast_input_ptr + x, - Qasymm8QuantizationHelper<InputScalarType>::dequantize(broadcast_value, broadcast_qinfo), - output_ptr + x, - !is_broadcast_input_2, - non_broadcast_voffset, output_voffset, - non_broadcast_vscale, output_vscale - }; - broadcast_func(pg, args); + result = svcreate4( + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 0), svget4(in1, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 1), svget4(in1, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 2), svget4(in1, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in2, 3), svget4(in1, 3), op)); + } + else + { + result = svcreate4( + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op)); + } + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + x += wrapper::svcnt<InputScalarType>(); pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); } @@ -317,16 +345,19 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o svbool_t pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); do { - const auto args = QuantizedLoopArguments<InputScalarType, OutputScalarType, OperatorType> - { - op, - input1_ptr + x, - input2_ptr + x, - output_ptr + x, - in1_voffset, in2_voffset, output_voffset, - in1_vscale, in2_vscale, output_vscale - }; - func(pg, args); + const auto in1 = load_quantized(input1_ptr + x, pg, in1_voffset, in1_vscale); + const auto in2 = load_quantized(input2_ptr + x, pg, in2_voffset, in2_vscale); + const auto result = svcreate4( + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 0), svget4(in2, 0), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 1), svget4(in2, 1), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 2), svget4(in2, 2), op), + elementwise_comparison_op<svfloat32_t, OutputVectorType>(pg, svget4(in1, 3), svget4(in2, 3), op)); + + const auto zipped_bottom = svzip1(svget4(result, 0), svget4(result, 1)); + const auto zipped_top = svzip1(svget4(result, 2), svget4(result, 3)); + const auto zipped = svzip1(zipped_bottom, zipped_top); + svst1(pg, output_ptr + x, zipped); + x += wrapper::svcnt<InputScalarType>(); pg = wrapper::svwhilelt<InputScalarType>(x, window_end_x); } @@ -335,26 +366,6 @@ void elementwise_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *o input1, input2, output); } } - -template <ArithmeticOperation op, typename ScalarType> -void elementwise_arithmetic_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - using VectorType = typename wrapper::traits::sve_vector<ScalarType>::type; - elementwise_quantized_op<VectorType, VectorType, ArithmeticOperation>(in1, in2, out, window, op, - &arithmetic_op_quantized_loop<ScalarType, ScalarType>, - &arithmetic_op_broadcast_quantized_loop<ScalarType, ScalarType>); -} - -template <ComparisonOperation op, typename InputScalarType, typename OutputScalarType = uint8_t> -void elementwise_comparison_quantized_op(const ITensor *in1, const ITensor *in2, ITensor *out, const Window &window) -{ - static_assert(sizeof(InputScalarType) >= sizeof(OutputScalarType), "input data type's width should be equal to or greater than output data type's width"); - using InputVectorType = typename wrapper::traits::sve_vector<InputScalarType>::type; - using OutputVectorType = typename wrapper::traits::sve_vector<OutputScalarType>::type; - elementwise_quantized_op<InputVectorType, OutputVectorType, ComparisonOperation>(in1, in2, out, window, op, - &comparison_op_quantized_loop<InputScalarType, OutputScalarType>, - &comparison_op_broadcast_quantized_loop<InputScalarType, OutputScalarType>); -} } // namespace cpu } // namespace arm_compute |