diff options
-rw-r--r-- | src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp | 12 | ||||
-rw-r--r-- | tests/validation/NEON/DirectConvolutionLayer.cpp | 27 |
2 files changed, 33 insertions, 6 deletions
diff --git a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp index 91b03687d8..559b67316f 100644 --- a/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDirectConvolutionLayerKernel.cpp @@ -979,7 +979,7 @@ public: // |__________________| // | pad_bottom | // |******************| - const int max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y; + const int64_t max_offset = input_stride_z * input_depth - (input->info()->padding().bottom + input->info()->padding().top) * input_stride_y; execute_window_loop(window_k, [&](const Coordinates & id_k) // loop on the batch size { @@ -1002,34 +1002,34 @@ public: for(int x = 0; x < input_width; x += num_elems_read_per_iteration) { // z == 0 - auto in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top); + auto in_z = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top); in_z = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth)); auto offset = y_offset + in_z * input_stride_z; offset = std::min(offset, max_offset); convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 0 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); // z == 1 - in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 1); + in_z = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 1); in_z = std::min(static_cast<unsigned int>(in_z), static_cast<unsigned int>(input_depth)); offset = y_offset + in_z * input_stride_z; offset = std::min(offset, max_offset); convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 1 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); // z == 2 - in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 2); + in_z = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 2); in_z = std::min(static_cast<unsigned int>(in_z), 
static_cast<unsigned int>(input_depth)); offset = y_offset + in_z * input_stride_z; offset = std::min(offset, max_offset); convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 2 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); // z == 3 - in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 3); + in_z = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 3); offset = y_offset + in_z * input_stride_z; offset = std::min(offset, max_offset); convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 3 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); // z == 4 - in_z = static_cast<int>(id.z() * conv_stride_y - conv_pad_top + 4); + in_z = static_cast<int64_t>(id.z() * conv_stride_y - conv_pad_top + 4); offset = y_offset + in_z * input_stride_z; convolve_row1x9_nhwc(in_ptr + offset + x, weights_ptr + 4 * kernel_stride_z + x, input_stride_y, kernel_stride_y, out0, out1, out2, out3); diff --git a/tests/validation/NEON/DirectConvolutionLayer.cpp b/tests/validation/NEON/DirectConvolutionLayer.cpp index fe61a9d98a..88578ca586 100644 --- a/tests/validation/NEON/DirectConvolutionLayer.cpp +++ b/tests/validation/NEON/DirectConvolutionLayer.cpp @@ -98,6 +98,25 @@ const auto data_f16_nightly = combine(data_f16, framework::dataset::make("NumKer const auto data_precommit = combine(data, framework::dataset::make("NumKernels", { 1 })); const auto data_precommit9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 4 })); +/* The following test is from a real use-case that made DirectConvolution + * overflow in its tensor indexing. This test case uses + * a separate tolerance due to the following reasons. + * - It has been shown that it generally requires a larger absolute tolerance + * for large numbers or a larger relative tolerance for small numbers. 
+ * - With the first reason, since it is mainly testing index overflow, + * a value with a margin is used to avoid unintended test failures + * during nightly. */ +constexpr AbsoluteTolerance<float> usecase_tolerance_fp32(0.05f); + +const auto data_nightly_usecase = combine(framework::dataset::make("InputShape", { TensorShape{ 3U, 800U, 800U } }), + combine(framework::dataset::make("StrideX", { 1 }), + combine(framework::dataset::make("StrideY", { 1 }), + combine(framework::dataset::make("PadX", { 4 }), + combine(framework::dataset::make("PadY", { 4 }), + combine(framework::dataset::make("KernelSize", 9), + framework::dataset::make("NumKernels", { 16 }))))))); + /** Activation function Dataset*/ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { @@ -227,6 +246,14 @@ FIXTURE_DATA_TEST_CASE(RunLarge, NEDirectConvolutionLayerFixture<float>, framewo // Validate output validate(Accessor(_target), _reference, tolerance_fp32); } +FIXTURE_DATA_TEST_CASE(RunLargeUsecase, NEDirectConvolutionLayerFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_usecase, framework::dataset::make("DataType", DataType::F32)), framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + // Validate output + validate(Accessor(_target), _reference, usecase_tolerance_fp32); +} TEST_SUITE_END() // FP32 TEST_SUITE_END() // Float TEST_SUITE_END() // DirectConvolutionLayer |