diff options
4 files changed, 54 insertions, 17 deletions
diff --git a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h index d56fd44700..b245505ac6 100644 --- a/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h +++ b/arm_compute/core/NEON/kernels/detail/NEDirectConvolutionDetail.h @@ -460,8 +460,12 @@ inline float16x8x2_t convolve_3x3<2>(const float16_t *in_top, const float16_t *i { float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 2), out.val[0], 1); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 2); - out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 3); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 4), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 3); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 0), out.val[0], 4); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 2), out.val[0], 5); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 4), out.val[0], 6); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 6), out.val[0], 7); return out; } @@ -470,6 +474,8 @@ inline float16x8x2_t convolve_3x3<3>(const float16_t *in_top, const float16_t *i { float16x8x2_t out = convolve_3x3<1>(in_top, in_mid, in_low, m0, m1, m2); out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 3), out.val[0], 1); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[0], 6), out.val[0], 2); + out.val[0] = vsetq_lane_f16(vgetq_lane_f16(out.val[1], 1), out.val[0], 3); return out; } diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 59244c876c..f525d93e83 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -987,6 +987,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(width_idx) != weights->dimension(height_idx)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::NHWC && input->data_type() != DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(width_idx) > 3) && (input->data_type() == DataType::F16)); // Checks performed when output is configured if(output->total_size() != 0) @@ -1051,8 +1052,6 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen break; } case 3: - case 5: - { switch(input->data_type()) { case DataType::F32: @@ -1071,6 +1070,20 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR("Data type not supported."); break; } + break; + case 5: + { + switch(input->data_type()) + { + case DataType::F32: + num_weight_elems_read_per_row = 4 + kernel_size - 1; + num_elems_read_per_iteration = 12; + num_elems_written_per_iteration = 16 >> conv_stride_x; + break; + default: + ARM_COMPUTE_ERROR("Data type not supported."); + break; + } } break; default: diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp index 58f3f0df37..18072e0532 100644 --- a/tests/validation/NEON/ConvolutionLayer.cpp +++ b/tests/validation/NEON/ConvolutionLayer.cpp @@ -50,9 +50,11 @@ namespace RelativeTolerance<float> rel_tolerance_f32(0.01f); /**< Relative tolerance for FP32 types */ const AbsoluteTolerance<float> abs_tolerance_f32(0.002f); /**< Absolute tolerance for FP32 types */ #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -const AbsoluteTolerance<float> tolerance_f16(0.01f); /**< Tolerance value for comparing reference's output against implementation's output for DataType::F16 */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ +const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */ +const AbsoluteTolerance<float> abs_tolerance_f16(0.2f); /**< Absolute tolerance for FP16 types */ +constexpr float tolerance_num = 0.07f; /**< Tolerance number for the FP16 implementation */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +constexpr AbsoluteTolerance<float> tolerance_qasymm8(0.0); /**< Tolerance value for comparing reference's output against implementation's output for quantized data types */ /** CNN data types */ const auto CNNDataTypes = framework::dataset::make("DataType", @@ -206,7 +208,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, NEGEMMConvolutionLayerFixture<half>, framework: ActivationFunctionsDataset)) { // Validate output - validate(Accessor(_target), _reference, tolerance_f16); + validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(combine(combine(datasets::LargeConvolutionLayerDataset(), framework::dataset::make("ReshapeWeights", { true })), @@ -215,7 +217,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEGEMMConvolutionLayerFixture<half>, framework: ActivationFunctionsDataset)) { // Validate output - validate(Accessor(_target), _reference, tolerance_f16); + validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } TEST_SUITE_END() #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index acd0e5d64b..cd186e05cd 100644 --- a/tests/validation/NEON/DirectConvolutionLayer.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -43,11 +43,13 @@ namespace validation namespace { #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC -constexpr AbsoluteTolerance<float> tolerance_fp16(0.01f); /**< Tolerance for half precision floating point tests */ -#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ -constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */ +const RelativeTolerance<half_float::half> rel_tolerance_f16(half_float::half(0.2f)); /**< Relative tolerance value for FP16 types */ +const AbsoluteTolerance<float> abs_tolerance_f16(0.2f); /**< Absolute tolerance for FP16 types */ +constexpr float tolerance_num = 0.07f; /**< Tolerance number for the FP16 implementation */ +#endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ +constexpr AbsoluteTolerance<float> tolerance_fp32(0.001f); /**< Tolerance for floating point tests */ -/** Direct convolution data set. */ +/** Direct convolution data set.for FP32 */ const auto data_pad_f32 = concat(concat(combine(framework::dataset::make("PadX", { 0, 1 }), combine(framework::dataset::make("PadY", { 0, 1 }), framework::dataset::make("KernelSize", 3))), @@ -58,12 +60,26 @@ const auto data_pad_f32 = concat(concat(combine(framework::dataset::make("PadX", combine(framework::dataset::make("PadY", { 0, 3 }), framework::dataset::make("KernelSize", 5)))); +/** Direct convolution data set.for FP16 */ +const auto data_pad_f16 = concat(combine(framework::dataset::make("PadX", { 0, 1 }), + combine(framework::dataset::make("PadY", { 0, 1 }), + framework::dataset::make("KernelSize", 3))), + combine(framework::dataset::make("PadX", { 0 }), + combine(framework::dataset::make("PadY", { 0 }), + framework::dataset::make("KernelSize", 1)))); + const auto data_f32 = combine(datasets::SmallDirectConvolutionShapes(), - combine(framework::dataset::make("StrideX", { 1, 3 }), - combine(framework::dataset::make("StrideY", { 1, 3 }), + combine(framework::dataset::make("StrideX", { 1, 2, 3 }), + combine(framework::dataset::make("StrideY", { 1, 2, 3 }), combine(data_pad_f32, framework::dataset::make("NumKernels", { 1, 4, 8, 16 }))))); +const auto data_f16 = combine(datasets::SmallDirectConvolutionShapes(), + combine(framework::dataset::make("StrideX", { 1, 2, 3 }), + combine(framework::dataset::make("StrideY", { 1, 2, 3 }), + combine(data_pad_f16, + framework::dataset::make("NumKernels", { 1, 4, 8, 16 }))))); + /** Activation function Dataset*/ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { @@ -152,12 +168,12 @@ using NEDirectConvolutionLayerFixture = DirectConvolutionValidationFixture<Tenso TEST_SUITE(Float) #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC TEST_SUITE(FP16) -FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(data_f32, framework::dataset::make("DataType", DataType::F16)), +FIXTURE_DATA_TEST_CASE(Run, NEDirectConvolutionLayerFixture<half>, framework::DatasetMode::ALL, combine(combine(combine(data_f16, framework::dataset::make("DataType", DataType::F16)), ActivationFunctionsDataset), framework::dataset::make("DataLayout", DataLayout::NCHW))) { // Validate output - validate(Accessor(_target), _reference, tolerance_fp16); + validate(Accessor(_target), _reference, rel_tolerance_f16, tolerance_num, abs_tolerance_f16); } TEST_SUITE_END() #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ |