From d041a835041159a0a6744fc271df15e9f66167bc Mon Sep 17 00:00:00 2001
From: Pablo Tello
Date: Wed, 3 Oct 2018 17:11:09 +0100
Subject: COMPMID-1610: Fixed CLDirectConvolution mismatches Kernel size 5x5 layout NHWC.

Change-Id: Ia82ff211d1c954df228962b5c2c5ad8df7112449
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/151740
Reviewed-by: Michalis Spyrou
Tested-by: bsgcomp
---
 src/core/CL/cl_kernels/direct_convolution5x5.cl    | 62 ++++++++++++++++++++--
 .../CL/kernels/CLDirectConvolutionLayerKernel.cpp  |  3 +-
 tests/validation/CL/DirectConvolutionLayer.cpp     | 24 ++++-----
 3 files changed, 68 insertions(+), 21 deletions(-)

diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/direct_convolution5x5.cl
index 70be058854..5299409243 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/direct_convolution5x5.cl
@@ -194,11 +194,11 @@ __kernel void direct_convolution5x5_nhwc(
     __global uchar *src_addr = (__global uchar *)offset(&src, 0, 0) - src_stride_x * id0 + ((id2 * STRIDE_Y) - PAD_TOP) * (int)src_stride_z;
 
     weights_addr += id0 * weights_stride_w;
 
-    const int coordy = id2 - PAD_TOP;
+#if(PAD_TOP == 1)
+    const int coordy = id2 - PAD_TOP;
     for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
     {
-#if(PAD_TOP)
         if(coordy < 0) // special case Z = -1 doesn't exists
         {
             //skip first row and load the two next ones
@@ -224,17 +224,69 @@ __kernel void direct_convolution5x5_nhwc(
             CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
             CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
         }
-#else //PAD_TOP > 0
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+#elif(PAD_TOP == 2)
+    const int coordy = id2 * STRIDE_Y;
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
+        if(coordy == 0) // special case Z = -2 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == 1) // special case Z = -1 doesn't exists
+        {
+            //skip first row and load the two next ones
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 1))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+        }
+        else if(coordy == (SRC_HEIGHT - 2))
+        {
+            // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the
+            // Z axis has no padding at all.
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+        }
+        else
+        {
+            CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
+            CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
+        }
+        src_addr += src_stride_x;
+        weights_addr += weights_stride_x;
+    }
+
+#else /* PAD_TOP == 2 */
+    for(volatile int d = 0; d < WEIGHTS_DEPTH; ++d)
+    {
         CONVOLUTION1x5_NHWC(values0, src_addr, weights_addr);
         CONVOLUTION1x5_NHWC(values0, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z));
         CONVOLUTION1x5_NHWC(values0, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z));
         CONVOLUTION1x5_NHWC(values0, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z));
         CONVOLUTION1x5_NHWC(values0, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z));
-#endif // PAD_TOP > 0
-
         src_addr += src_stride_x;
         weights_addr += weights_stride_x;
     }
+#endif /* PAD_TOP == 1 */
 
 #ifdef HAS_BIAS
     Vector biases = CONVERT_TO_VECTOR_STRUCT_NO_STEP(biases);
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index c3d514adb4..471b3209ac 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -421,8 +421,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     }
     else
     {
-        bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
-
+        const bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
         build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
         build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)));
         build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type)));
diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp
index e29ec7ef4c..9db73820c2 100644
--- a/tests/validation/CL/DirectConvolutionLayer.cpp
+++ b/tests/validation/CL/DirectConvolutionLayer.cpp
@@ -44,23 +44,20 @@ namespace validation
 namespace
 {
 // COMPMID-517 Invesitgate the mismatch to see whether it is a real bug
-RelativeTolerance<half>  tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */
-RelativeTolerance<float> tolerance_fp32(0.02f);     /**< Tolerance for floating point tests */
-constexpr float          tolerance_num = 0.07f;     /**< Tolerance number */
+RelativeTolerance<half>              tolerance_fp16(half(0.2)); /**< Tolerance for floating point tests */
+RelativeTolerance<float>             tolerance_fp32(0.02f);     /**< Tolerance for floating point tests */
+constexpr float                      tolerance_num = 0.07f;     /**< Tolerance number */
+constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1);      /**< Tolerance for quantized tests */
 
-constexpr AbsoluteTolerance<uint8_t> tolerance_qasymm8(1); /**< Tolerance for quantized tests */
+const auto data_strides     = combine(framework::dataset::make("StrideX", 1, 3), framework::dataset::make("StrideY", 1, 3));
+const auto data_ksize_one   = combine(framework::dataset::make("PadX", 0, 1), combine(framework::dataset::make("PadY", 0, 1), framework::dataset::make("KernelSize", 1)));
+const auto data_ksize_three = combine(framework::dataset::make("PadX", 0, 2), combine(framework::dataset::make("PadY", 0, 2), framework::dataset::make("KernelSize", 3)));
+const auto data_ksize_five  = combine(framework::dataset::make("PadX", 0, 3), combine(framework::dataset::make("PadY", 0, 3), framework::dataset::make("KernelSize", 5)));
+const auto data_all_kernels = concat(concat(data_ksize_one, data_ksize_three), data_ksize_five);
 
 /** Direct convolution data set. */
 const auto data = combine(datasets::SmallDirectConvolutionShapes(),
-                          combine(framework::dataset::make("StrideX", 1, 3),
-                                  combine(framework::dataset::make("StrideY", 1, 3),
-                                          combine(concat(combine(framework::dataset::make("PadX", 0, 1),
-                                                                 combine(framework::dataset::make("PadY", 0, 1),
-                                                                         framework::dataset::make("KernelSize", 1))),
-                                                         combine(framework::dataset::make("PadX", 0, 2),
-                                                                 combine(framework::dataset::make("PadY", 0, 2),
-                                                                         framework::dataset::make("KernelSize", { 3, 5 })))),
-                                                 framework::dataset::make("NumKernels", { 1, 4, 8, 16 })))));
+                          combine(data_strides, combine(data_all_kernels, framework::dataset::make("NumKernels", { 1, 4, 8, 16 }))));
 
 /** Activation function Dataset*/
 const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo",
@@ -174,7 +171,6 @@ FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionLayerFixture<float>, framework::D
                        ActivationFunctionsDataset),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })))
 {
-    // Validate output
     validate(CLAccessor(_target), _reference, tolerance_fp32);
 }
 TEST_SUITE_END()
--
cgit v1.2.1
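
Note (editorial, not part of the patch): the PAD_TOP == 2 path added above enumerates, per output row, which of the five kernel rows still fall inside the input height. The same selection can be written as one clamped loop, which is a convenient way to sanity-check the special cases. The sketch below is a hypothetical, stand-alone host-side C program written for this note; the helper name valid_kernel_rows and the concrete SRC_HEIGHT, PAD_TOP and STRIDE_Y values are illustrative assumptions and not ComputeLibrary code.

/* Hypothetical illustration only -- not part of the patch above. */
#include <stdio.h>

#define KERNEL_H 5

/* For one output row, report which of the five kernel rows read an
 * in-bounds input row.  Rows that land in the top/bottom padding are
 * skipped, mirroring the special cases in direct_convolution5x5_nhwc. */
static void valid_kernel_rows(int out_y, int pad_top, int stride_y, int src_height)
{
    const int first_in_y = out_y * stride_y - pad_top; /* input row touched by kernel row 0 */

    printf("out_y=%d:", out_y);
    for(int k = 0; k < KERNEL_H; ++k)
    {
        const int in_y = first_in_y + k;
        if(in_y >= 0 && in_y < src_height)
        {
            printf(" k%d->y%d", k, in_y); /* this kernel row is accumulated */
        }
        /* else: the row lies in the padded border and contributes nothing */
    }
    printf("\n");
}

int main(void)
{
    const int src_height = 8; /* assumed SRC_HEIGHT */
    for(int out_y = 0; out_y < src_height; ++out_y)
    {
        valid_kernel_rows(out_y, /*pad_top=*/2, /*stride_y=*/1, src_height);
    }
    return 0;
}

With these assumed values (SRC_HEIGHT = 8, PAD_TOP = 2, STRIDE_Y = 1) the program reports kernel rows 2-4 for out_y = 0, rows 1-4 for out_y = 1, rows 0-3 for out_y = 6 and rows 0-2 for out_y = 7, which are exactly the coordy == 0, coordy == 1, coordy == (SRC_HEIGHT - 2) and coordy == (SRC_HEIGHT - 1) branches the kernel spells out; all other rows accumulate the full five-row window.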