-rw-r--r--  arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h | 10
-rw-r--r--  src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 17
-rw-r--r--  tests/validation/NEON/ConvolutionLayer.cpp | 12
-rw-r--r--  tests/validation/NEON/DirectConvolutionLayer.cpp | 32
4 files changed, 54 insertions, 17 deletions
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
index d56fd44700..b245505ac6 100644
--- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
+++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h
@@ -460,8 +460,12 @@ inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *i
{
float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2);
- out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
return out;
}
@@ -470,6 +474,8 @@ inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *i
{
float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2);
out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2);
+ out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3);
return out;
}
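
Context for the two hunks above: convolve_3x3<1> produces sixteen contiguous stride-1 FP16 results across out.val[0] and out.val[1], and the strided variants keep only every second (stride 2) or third (stride 3) of them. The patched stride-2 code therefore packs lanes 0, 2, 4, 6 of both vectors into lanes 0-7 of out.val[0]. A minimal standalone sketch of that compaction, mirroring the lane shuffles in the patch (the helper name compact_stride2 is illustrative, not library code):

```cpp
#include <arm_neon.h>

#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
// Pack lanes 0, 2, 4, 6 of two stride-1 result vectors into one vector of
// stride-2 results: {a0, a2, a4, a6, b0, b2, b4, b6}. Lane 0 is already in place.
inline float16x8_t compact_stride2(float16x8x2_t out)
{
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6);
    out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7);
    return out.val[0];
}
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
```

For stride 3 the same idea keeps every third result, which is why the second hunk fills lanes 1-3 from positions 3, 6 and 9 (lane 1 of out.val[1]).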
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
index 59244c876c..f525d93e83 100644
--- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp
@@ -987,6 +987,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx));
ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (input->data_type() == DataType::F16));
// Checks performed when output is configured
if(output->total_size() != 0)
@@ -1051,8 +1052,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
break;
}
case 3:
- case 5:
- {
switch(input->data_type())
{
case DataType::F32:
@@ -1071,6 +1070,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
ARM_COMPUTE_ERROR("Data type not supported.");
break;
}
+ break;
+ case 5:
+ {
+ switch(input->data_type())
+ {
+ case DataType::F32:
+ num_weight_elems_read_per_row = 4 + kernel_size - 1;
+ num_elems_read_per_iteration = 12;
+ num_elems_written_per_iteration = 16 >> conv_stride_x;
+ break;
+ default:
+ ARM_COMPUTE_ERROR("Data type not supported.");
+ break;
+ }
}
break;
default:
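
The kernel changes above do two things: validate_arguments now rejects F16 inputs when the kernel is larger than 3x3, and validate_and_configure_window splits the previously shared case 3 / case 5 branch so that the 5x5 path only configures an F32 window. The sizing computed in the new 5x5 branch is simple arithmetic; a standalone sketch of just those expressions (not library code):

```cpp
#include <cassert>

// Arithmetic from the new 5x5 F32 branch: weight elements read per row and
// output elements written per iteration for the strides used in the tests.
int main()
{
    const int kernel_size = 5;
    const int num_weight_elems_read_per_row = 4 + kernel_size - 1;
    assert(num_weight_elems_read_per_row == 8);

    // 16 >> conv_stride_x: 8 outputs at stride 1, 4 at stride 2, 2 at stride 3.
    assert((16 >> 1) == 8);
    assert((16 >> 2) == 4);
    assert((16 >> 3) == 2);
    return 0;
}
```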
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 58f3f0df37..18072e0532 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -50,9 +50,11 @@ namespace
RelativeTolerance<float> rel_tolerance_f32(0.01f); /**< Relative tolerance for FP32 types */
const AbsoluteTolerance<float> abs_tolerance_f32(0.002f); /**< Absolute tolerance for FP32 types */
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-const AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
+const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */
+const AbsoluteTolerance<float> abs_tolerance_f16(0.2f); /**< Absolute tolerance for FP16 types */
+constexpr float tolerance_num = 0.07f; /**< Tolerance number for the FP16 implementation */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */
/** CNN data types */
const auto CNNDataTypes = framework::dataset::make("DataType",
@@ -206,7 +208,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<half>, framework:
ActivationFunctionsDataset))
{
// Validate output
- validate(Accessor(_target), _reference, tolerance_f16);
+ validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
}
FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(),
framework::dataset::make("ReshapeWeights", { true })),
@@ -215,7 +217,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<half>, framework:
ActivationFunctionsDataset))
{
// Validate output
- validate(Accessor(_target), _reference, tolerance_f16);
+ validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
}
TEST_SUITE_END()
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
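
The FP16 validation above now passes a relative tolerance, an absolute tolerance and a tolerance_num value instead of a single absolute tolerance. As a rough model of what such a combined check means (an assumption about the test framework's semantics, not its actual code; the helper name below is mine): an element passes if it is within either tolerance, and the tensor passes if at most a tolerance_num fraction of elements mismatch.

```cpp
#include <cmath>
#include <cstddef>

// Assumed model of a combined tolerance check, not the framework's code.
bool validate_fp16(const float *target, const float *reference, std::size_t n,
                   float rel_tol = 0.2f, float abs_tol = 0.2f, float tolerance_num = 0.07f)
{
    std::size_t mismatches = 0;
    for(std::size_t i = 0; i < n; ++i)
    {
        const float err        = std::fabs(target[i] - reference[i]);
        const bool  within_abs = err <= abs_tol;
        const bool  within_rel = err <= rel_tol * std::fabs(reference[i]);
        if(!within_abs && !within_rel)
        {
            ++mismatches;
        }
    }
    // Allow up to tolerance_num * n mismatching elements.
    return static_cast<float>(mismatches) <= tolerance_num * static_cast<float>(n);
}
```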
diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp
index acd0e5d64b..cd186e05cd 100644
--- a/tests/validation/NEON/DirectConvolutionLayer.cpp
+++ b/tests/validation/NEON/DirectConvolutionLayer.cpp
@@ -43,11 +43,13 @@ namespace validation
namespace
{
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
-constexpr AbsoluteTolerance<float> tolerance_fp16(0.01f); /**< Tolerance for half precision floating point tests */
-#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
-constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */
+const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */
+const AbsoluteTolerance<float> abs_tolerance_f16(0.2f); /**< Absolute tolerance for FP16 types */
+constexpr float tolerance_num = 0.07f; /**< Tolerance number for the FP16 implementation */
+#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
+constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */
-/** Direct convolution data set. */
+/** Direct convolution data set for FP32 */
const auto data_pad_f32 = concat(concat(combine(framework::dataset::make("PadX", { 0, 1 }),
combine(framework::dataset::make("PadY", { 0, 1 }),
framework::dataset::make("KernelSize", 3))),
@@ -58,12 +60,26 @@ const auto data_pad_f32 = concat(concat(combine(framework::dataset::make("PadX",
combine(framework::dataset::make("PadY", { 0, 3 }),
framework::dataset::make("KernelSize", 5))));
+/** Direct convolution data set for FP16 */
+const auto data_pad_f16 = concat(combine(framework::dataset::make("PadX", { 0, 1 }),
+ combine(framework::dataset::make("PadY", { 0, 1 }),
+ framework::dataset::make("KernelSize", 3))),
+ combine(framework::dataset::make("PadX", { 0 }),
+ combine(framework::dataset::make("PadY", { 0 }),
+ framework::dataset::make("KernelSize", 1))));
+
const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(),
- combine(framework::dataset::make("StrideX", { 1, 3 }),
- combine(framework::dataset::make("StrideY", { 1, 3 }),
+ combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+ combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
combine(data_pad_f32,
framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
+const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(),
+ combine(framework::dataset::make("StrideX", { 1, 2, 3 }),
+ combine(framework::dataset::make("StrideY", { 1, 2, 3 }),
+ combine(data_pad_f16,
+ framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
+
/** Activation function Dataset*/
const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
{
@@ -152,12 +168,12 @@ using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<Tenso
TEST_SUITE(Float)
#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(data_f32, framework::dataset::make("DataType", DataType::F16)),
+FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(data_f16, framework::dataset::make("DataType", DataType::F16)),
ActivationFunctionsDataset),
framework::dataset::make("DataLayout", DataLayout::NCHW)))
{
// Validate output
- validate(Accessor(_target), _reference, tolerance_fp16);
+ validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16);
}
TEST_SUITE_END()
#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
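
The new data_f16 set mirrors data_f32 but drops the 5x5 entries, keeping the FP16 tests consistent with the new validate_arguments check that rejects F16 kernels larger than 3. Assuming combine builds a cartesian product and concat appends datasets (an assumption about the dataset helpers, not taken from this patch), data_pad_f16 expands to the following pad/kernel combinations:

```cpp
#include <cstdio>

// Enumerates the (PadX, PadY, KernelSize) tuples that data_pad_f16 would
// contribute under the cartesian-product reading of combine/concat.
int main()
{
    const int pads_3x3[] = { 0, 1 };
    for(int pad_x : pads_3x3)
    {
        for(int pad_y : pads_3x3)
        {
            std::printf("PadX=%d PadY=%d KernelSize=3\n", pad_x, pad_y);
        }
    }
    std::printf("PadX=0 PadY=0 KernelSize=1\n");
    return 0;
}
```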