 src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp | 35 ++++++++++++++++++++---------------
 tests/validation/NEON/ArithmeticSubtraction.cpp         | 17 ++++++++++++++---
 tests/validation/fixtures/ArithmeticOperationsFixture.h | 14 ++++++++++++++
 3 files changed, 48 insertions(+), 18 deletions(-)
diff --git a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
index 8bfb37ea18..2b3fce3fea 100644
--- a/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
+++ b/src/core/NEON/kernels/NEArithmeticSubtractionKernel.cpp
@@ -182,11 +182,7 @@ void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const W
const UniformQuantizationInfo iq2_info = in2->info()->quantization_info().uniform();
const UniformQuantizationInfo oq_info = out->info()->quantization_info().uniform();
- const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
- const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
const float32x4_t invvscaleo = vdupq_n_f32(1.f / oq_info.scale);
- const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
- const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
const float32x4_t voffseto = vdupq_n_f32(oq_info.offset);
if(is_broadcast_across_x)
@@ -198,6 +194,10 @@ void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const W
const ITensor *non_broadcast_tensor = !is_broadcast_input_2 ? in2 : in1;
const UniformQuantizationInfo broadcast_qinfo = broadcast_tensor->info()->quantization_info().uniform();
const UniformQuantizationInfo non_broadcast_qinfo = non_broadcast_tensor->info()->quantization_info().uniform();
+ const float32x4_t vscale1 = is_broadcast_input_2 ? vdupq_n_f32(iq1_info.scale) : vdupq_n_f32(iq2_info.scale);
+ const float32x4_t vscale2 = is_broadcast_input_2 ? vdupq_n_f32(iq2_info.scale) : vdupq_n_f32(iq1_info.scale);
+ const int32x4_t voffset1 = is_broadcast_input_2 ? vdupq_n_s32(iq1_info.offset) : vdupq_n_s32(iq2_info.offset);
+ const int32x4_t voffset2 = is_broadcast_input_2 ? vdupq_n_s32(iq2_info.offset) : vdupq_n_s32(iq1_info.offset);
// Clear X Dimension on execution window as we handle manually
non_broadcast_win.set(Window::DimX, Window::Dimension(0, 1, 1));
@@ -223,7 +223,6 @@ void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const W
vmulq_f32(vcvtq_f32_s32(vsubq_s32(wrapper::vreinterpret(wrapper::vmovl(wrapper::vgethigh(wrapper::vmovl(wrapper::vgethigh(broadcast_value_vec))))), voffset2)), vscale2),
}
};
- const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
// Compute S elements per iteration
int x = window_start_x;
@@ -244,16 +243,16 @@ void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const W
const int32x4x4_t rf =
{
{
-#ifdef __aarch64__
- vcvtnq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtnq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
+#ifdef __aarch64__
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
+ vcvtnq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#else //__aarch64__
- vcvtq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
- vcvtq_s32_f32(vmlaq_f32(voffseto, is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[0], af.val[0]) : vsubq_f32(af.val[0], bf.val[0]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[1], af.val[1]) : vsubq_f32(af.val[1], bf.val[1]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[2], af.val[2]) : vsubq_f32(af.val[2], bf.val[2]), invvscaleo)),
+ vcvtq_s32_f32(vmlaq_f32(voffseto, !is_broadcast_input_2 ? vsubq_f32(bf.val[3], af.val[3]) : vsubq_f32(af.val[3], bf.val[3]), invvscaleo)),
#endif //__aarch64__
}
};
@@ -267,13 +266,19 @@ void sub_quantized(const ITensor *in1, const ITensor *in2, ITensor *out, const W
for(; x < window_end_x; ++x)
{
const float afs = static_cast<int32_t>(*(non_broadcast_input_ptr + x) - non_broadcast_qinfo.offset) * non_broadcast_qinfo.scale;
- *(output_ptr + x) = quantize<T>((afs - bfs), out->info()->quantization_info());
+ const float bfs = static_cast<int32_t>(broadcast_value - broadcast_qinfo.offset) * broadcast_qinfo.scale;
+ *(output_ptr + x) = quantize<T>(is_broadcast_input_2 ? afs - bfs : bfs - afs, out->info()->quantization_info());
}
},
broadcast_input, non_broadcast_input, output);
}
else
{
+ const float32x4_t vscale1 = vdupq_n_f32(iq1_info.scale);
+ const float32x4_t vscale2 = vdupq_n_f32(iq2_info.scale);
+ const int32x4_t voffset1 = vdupq_n_s32(iq1_info.offset);
+ const int32x4_t voffset2 = vdupq_n_s32(iq2_info.offset);
+
// Clear X Dimension on execution window as we handle manually
input1_win.set(Window::DimX, Window::Dimension(0, 1, 1));
input2_win.set(Window::DimX, Window::Dimension(0, 1, 1));
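
For reference, here is a minimal scalar sketch of the semantics the patched broadcast path implements: dequantize each operand with its own (scale, offset), subtract in float in the order dictated by which input is broadcast, then requantize. The dequant/requant helpers below are illustrative stand-ins, not the library's quantize<T> API, and uniform QASYMM8 quantization is assumed.

#include <algorithm>
#include <cmath>
#include <cstdint>

struct UniformQInfo { float scale; int32_t offset; };

// Illustrative helper: map a quantized value back to float.
static float dequant(uint8_t v, UniformQInfo q)
{
    return static_cast<float>(static_cast<int32_t>(v) - q.offset) * q.scale;
}

// Illustrative helper: map a float back to the quantized domain, saturating.
static uint8_t requant(float v, UniformQInfo q)
{
    const int32_t r = static_cast<int32_t>(std::lround(v / q.scale)) + q.offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, r)));
}

// Scalar model of one output element of the broadcast path: in1 - in2
// becomes afs - bfs when input 2 is the broadcast operand, and bfs - afs
// otherwise, which is exactly what the patched ternaries above select.
static uint8_t sub_broadcast_scalar(uint8_t non_broadcast_value, uint8_t broadcast_value,
                                    UniformQInfo non_broadcast_q, UniformQInfo broadcast_q,
                                    UniformQInfo out_q, bool is_broadcast_input_2)
{
    const float afs = dequant(non_broadcast_value, non_broadcast_q);
    const float bfs = dequant(broadcast_value, broadcast_q);
    return requant(is_broadcast_input_2 ? afs - bfs : bfs - afs, out_q);
}
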
diff --git a/tests/validation/NEON/ArithmeticSubtraction.cpp b/tests/validation/NEON/ArithmeticSubtraction.cpp
index 420d61d1ee..8f9924becc 100644
--- a/tests/validation/NEON/ArithmeticSubtraction.cpp
+++ b/tests/validation/NEON/ArithmeticSubtraction.cpp
@@ -142,9 +142,10 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionFixture<uint8_t>, framew
}
TEST_SUITE_END() // U8
-using NEArithmeticSubtractionQASYMM8Fixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, uint8_t>;
-using NEArithmeticSubtractionQASYMM8SignedFixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
-using NEArithmeticSubtractionQSYMM16Fixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int16_t>;
+using NEArithmeticSubtractionQASYMM8Fixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, uint8_t>;
+using NEArithmeticSubtractionQASYMM8SignedFixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
+using NEArithmeticSubtractionQASYMM8SignedBroadcastFixture = ArithmeticSubtractionValidationQuantizedBroadcastFixture<Tensor, Accessor, NEArithmeticSubtraction, int8_t>;
+using NEArithmeticSubtractionQSYMM16Fixture = ArithmeticSubtractionValidationQuantizedFixture<Tensor, Accessor, NEArithmeticSubtraction, int16_t>;
TEST_SUITE(Quantized)
TEST_SUITE(QASYMM8)
@@ -167,6 +168,16 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEArithmeticSubtractionQASYMM8SignedFixture, fr
// Validate output
validate(Accessor(_target), _reference, tolerance_qasymm8);
}
+
+FIXTURE_DATA_TEST_CASE(RunSmallBroadcast, NEArithmeticSubtractionQASYMM8SignedBroadcastFixture, framework::DatasetMode::ALL, combine(combine(combine(
+ datasets::SmallShapesBroadcast(),
+ ArithmeticSubtractionQASYMM8SIGNEDDataset),
+ framework::dataset::make("ConvertPolicy", { ConvertPolicy::SATURATE })),
+ ArithmeticSubtractionQuantizationInfoSignedDataset))
+{
+ // Validate output
+ validate(Accessor(_target), _reference, tolerance_qasymm8);
+}
TEST_SUITE_END() // QASYMM8_SIGNED
TEST_SUITE(QSYMM16)
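
For orientation, datasets::SmallShapesBroadcast() supplies pairs of shapes in which one operand is collapsed to 1 along a dimension so it broadcasts against the other. A hypothetical pair of the kind the new RunSmallBroadcast case exercises (the exact dataset contents are not reproduced here) would be:

// Hypothetical broadcast pair: shape1 broadcasts along X against shape0,
// which is the code path fixed in the kernel above.
const TensorShape shape0{ 27U, 13U, 2U };
const TensorShape shape1{ 1U, 13U, 2U };
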
diff --git a/tests/validation/fixtures/ArithmeticOperationsFixture.h b/tests/validation/fixtures/ArithmeticOperationsFixture.h
index 1019e60233..05c55a0b63 100644
--- a/tests/validation/fixtures/ArithmeticOperationsFixture.h
+++ b/tests/validation/fixtures/ArithmeticOperationsFixture.h
@@ -265,6 +265,20 @@ public:
convert_policy, qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), false);
}
};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T>
+class ArithmeticSubtractionValidationQuantizedBroadcastFixture : public ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>
+{
+public:
+ template <typename...>
+ void setup(const TensorShape &shape0, const TensorShape &shape1, DataType data_type0, DataType data_type1, DataType output_data_type,
+ ConvertPolicy convert_policy, QuantizationInfo qinfo0, QuantizationInfo qinfo1, QuantizationInfo qinfo_out)
+ {
+ ArithmeticOperationGenericFixture<TensorType, AccessorType, FunctionType, T>::setup(reference::ArithmeticOperation::SUB, shape0, shape1,
+ data_type0, data_type1, output_data_type, convert_policy,
+ qinfo0, qinfo1, qinfo_out, ActivationLayerInfo(), false);
+ }
+};
} // namespace validation
} // namespace test
} // namespace arm_compute
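
As context for the __aarch64__ split in the kernel hunk above: on AArch64 the kernel rounds to nearest with vcvtnq_s32_f32, while the fallback path truncates with vcvtq_s32_f32. Below is a minimal sketch of that requantize step, assuming a NEON-capable target; diff stands in for one vector of dequantized differences, and voffseto/invvscaleo mirror the kernel's precomputed locals.

#include <arm_neon.h>

// Requantize one lane group: out_offset + diff / out_scale, folded into a
// single multiply-accumulate (vmlaq_f32 computes a + b * c).
int32x4_t requantize_lanes(float32x4_t diff, float32x4_t voffseto, float32x4_t invvscaleo)
{
    const float32x4_t scaled = vmlaq_f32(voffseto, diff, invvscaleo);
#ifdef __aarch64__
    return vcvtnq_s32_f32(scaled); // round to nearest, ties to even
#else  // __aarch64__
    return vcvtq_s32_f32(scaled);  // round toward zero
#endif // __aarch64__
}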