From ba2cc1aea6bcd16b3ad81b55be18911af83d2113 Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 15 Jul 2020 17:39:30 +0100 Subject: COMPMID-3577: 9x9 CLDirectConvolution failures Change-Id: I32588332080adfaa79227dadd0f152c1bd67ff62 Signed-off-by: Michele Di Giorgio Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3577 Tested-by: Arm Jenkins Comments-Addressed: Arm Jenkins Reviewed-by: Georgios Pinitas --- src/core/CL/cl_kernels/direct_convolution9x9.cl | 142 +++++------------------- tests/validation/CL/DirectConvolutionLayer.cpp | 21 +++- 2 files changed, 49 insertions(+), 114 deletions(-) diff --git a/src/core/CL/cl_kernels/direct_convolution9x9.cl b/src/core/CL/cl_kernels/direct_convolution9x9.cl index d0f635c6fa..64da38d64d 100644 --- a/src/core/CL/cl_kernels/direct_convolution9x9.cl +++ b/src/core/CL/cl_kernels/direct_convolution9x9.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2020 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,7 +25,7 @@ #undef CONVERT_SAT -#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) +#if defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) && defined(PAD_TOP) #define PTR_TO_VALUE(PTR, DATA_TYPE) *((__global DATA_TYPE *)(PTR)) @@ -288,103 +288,38 @@ __kernel void direct_convolution9x9_nhwc( weights_addr += id0 * weights_stride_w; -#if(PAD_TOP == 1) - const int coordy = id2 - PAD_TOP; - for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X) + const int coordy = (id2 * STRIDE_Y) - PAD_TOP; + if(coordy < 0) { - if(coordy < 0) // special case Z = -1 doesn't exists + // Skip first rows containing padding + for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X) { - //skip first row and load the two next ones - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z)); - } - else if(coordy == (DST_HEIGHT - PAD_TOP - 1)) - { - // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the - // Z axis has no padding at all. - CONVOLUTION1x9_NHWC(values, src_addr, weights_addr); - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); + const int start_z = -coordy; + for(int i = start_z; i < 9; ++i) + { + CONVOLUTION1x9_NHWC(values, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z)); + } + src_addr += STEP_X * sizeof(DATA_TYPE); + weights_addr += STEP_X * sizeof(DATA_TYPE); } - else - { - CONVOLUTION1x9_NHWC(values, src_addr, weights_addr); - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z)); - } - src_addr += STEP_X * sizeof(DATA_TYPE); - weights_addr += STEP_X * sizeof(DATA_TYPE); } -#elif(PAD_TOP == 2) // PAD_TOP == 1 - const int coordy = id2 * STRIDE_Y; - for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X) + else if(coordy > (SRC_HEIGHT - 9)) { - if(coordy == 0) // special case Z = -2 doesn't exists - { - //skip first row and load the two next ones - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z)); - } - else if(coordy == 1) // special case Z = -1 doesn't exists + for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X) { - //skip first row and load the two next ones - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z)); + // Avoid loading rows beyond the input height + const int end_z = SRC_HEIGHT - coordy; + for(int i = 0; i < end_z; ++i) + { + CONVOLUTION1x9_NHWC(values, (src_addr + i * (int)src_stride_z), (weights_addr + i * (int)weights_stride_z)); + } + src_addr += STEP_X * sizeof(DATA_TYPE); + weights_addr += STEP_X * sizeof(DATA_TYPE); } - else if(coordy == (SRC_HEIGHT - 5)) - { - // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the - // Z axis has no padding at all. - CONVOLUTION1x9_NHWC(values, src_addr, weights_addr); - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - } - else if(coordy == (SRC_HEIGHT - 6)) - { - // special case when computing the last row of the output we must read the last three rows from the input buffer (including padding) but the - // Z axis has no padding at all. - CONVOLUTION1x9_NHWC(values, src_addr, weights_addr); - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); - } - else + } + else + { + for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X) { CONVOLUTION1x9_NHWC(values, src_addr, weights_addr); CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); @@ -395,27 +330,10 @@ __kernel void direct_convolution9x9_nhwc( CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z)); + src_addr += STEP_X * sizeof(DATA_TYPE); + weights_addr += STEP_X * sizeof(DATA_TYPE); } - src_addr += STEP_X * sizeof(DATA_TYPE); - weights_addr += STEP_X * sizeof(DATA_TYPE); - } - -#else // PAD_TOP == 1 - for(volatile int d = 0; d < WEIGHTS_DEPTH; d += STEP_X) - { - CONVOLUTION1x9_NHWC(values, src_addr, weights_addr); - CONVOLUTION1x9_NHWC(values, (src_addr + 1 * (int)src_stride_z), (weights_addr + 1 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 2 * (int)src_stride_z), (weights_addr + 2 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 3 * (int)src_stride_z), (weights_addr + 3 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 4 * (int)src_stride_z), (weights_addr + 4 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 5 * (int)src_stride_z), (weights_addr + 5 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 6 * (int)src_stride_z), (weights_addr + 6 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 7 * (int)src_stride_z), (weights_addr + 7 * (int)weights_stride_z)); - CONVOLUTION1x9_NHWC(values, (src_addr + 8 * (int)src_stride_z), (weights_addr + 8 * (int)weights_stride_z)); - src_addr += STEP_X * sizeof(DATA_TYPE); - weights_addr += STEP_X * sizeof(DATA_TYPE); } -#endif // PAD_TOP == 1 #if defined(VEC_SIZE) REDUCE(values.s0, values0); @@ -443,4 +361,4 @@ __kernel void direct_convolution9x9_nhwc( *((__global DATA_TYPE *)(dst.ptr + 7 * dst_stride_y)) = values.s7; #undef STEP_X } -#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) +#endif // defined(DATA_TYPE) && defined(STRIDE_X) && defined(WEIGHTS_DEPTH) && defined(DATA_LAYOUT_NHWC) && defined(PAD_TOP) diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp index b1457abc8b..94a436ac6f 100644 --- a/tests/validation/CL/DirectConvolutionLayer.cpp +++ b/tests/validation/CL/DirectConvolutionLayer.cpp @@ -66,8 +66,16 @@ const auto data_small = combine(datasets::SmallDirectConvolutionShapes(), com const auto data_small9x9 = combine(datasets::SmallDirectConvolutionShapes(), combine(data_strides_small, data_ksize_nine_small)); /** Direct convolution nightly data set. */ -const auto data_nightly = combine(data, framework::dataset::make("NumKernels", { 1, 4 })); -const auto data_nightly_9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 1, 4 })); +const auto data_nightly = combine(data, framework::dataset::make("NumKernels", { 1, 4 })); +const auto data_nightly_9x9 = combine(data9x9, framework::dataset::make("NumKernels", { 1, 4 })); +const auto data_nightly_usecase = combine(framework::dataset::make("InputShape", { TensorShape{ 3U, 800U, 800U } }), + combine(framework::dataset::make("StrideX", { 1 }), + combine(framework::dataset::make("StrideY", { 1 }), + combine(framework::dataset::make("PadX", { 4 }), + combine(framework::dataset::make("PadY", { 4 }), + combine(framework::dataset::make("KernelSize", 9), + framework::dataset::make("NumKernels", { 16 }))))))); + /** Direct convolution precommit data set. */ const auto data_precommit = combine(data_small, framework::dataset::make("NumKernels", { 1 })); const auto data_precommit_9x9 = combine(data_small9x9, framework::dataset::make("NumKernels", { 1 })); @@ -223,6 +231,15 @@ FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerFixture, fram { validate(CLAccessor(_target), _reference, tolerance_fp32); } + +FIXTURE_DATA_TEST_CASE(RunLargeUsecase, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_usecase, framework::dataset::make("DataType", + DataType::F32)), + framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), + framework::dataset::make("DataLayout", { DataLayout::NHWC }))) +{ + // Validate output + validate(CLAccessor(_target), _reference, tolerance_fp32); +} TEST_SUITE_END() // FP32 TEST_SUITE(FP32_CustomDataset) -- cgit v1.2.1