From 0841ca085301e8ddbc9627b2be55758b66437c15 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Mon, 1 Feb 2021 14:37:02 +0000 Subject: Fix OpenCL direct convolution - The ARM DOT macro was using wrong variables for performing the dot product - K0 could be a non power of 2 values when IFM was not a multiple of 16 - Refactor the test for direct convolution NHWC Resolves COMPMID-4135, COMPMID-4155 Change-Id: I3a2dc89ef613ae20245cfc28e76ea36c55eaf81d Signed-off-by: Gian Marco Iodice Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/4962 Comments-Addressed: Arm Jenkins Reviewed-by: TeresaARM Tested-by: Arm Jenkins --- src/core/CL/cl_kernels/direct_convolution.cl | 34 ++- .../CL/kernels/CLDirectConvolutionLayerKernel.cpp | 2 +- tests/datasets/ShapeDatasets.h | 2 +- tests/validation/CL/DirectConvolutionLayer.cpp | 232 ++++++++++++++++----- 4 files changed, 193 insertions(+), 77 deletions(-) diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/direct_convolution.cl index 87f8153118..5d2a24e740 100644 --- a/src/core/CL/cl_kernels/direct_convolution.cl +++ b/src/core/CL/cl_kernels/direct_convolution.cl @@ -34,24 +34,24 @@ #else // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) #define ARM_DOT(x, y, val) \ ({ \ - val += (ACC_DATA_TYPE)a.s0 * (ACC_DATA_TYPE)b.s0; \ - val += (ACC_DATA_TYPE)a.s1 * (ACC_DATA_TYPE)b.s1; \ - val += (ACC_DATA_TYPE)a.s2 * (ACC_DATA_TYPE)b.s2; \ - val += (ACC_DATA_TYPE)a.s3 * (ACC_DATA_TYPE)b.s3; \ + val += (ACC_DATA_TYPE)x.s0 * (ACC_DATA_TYPE)y.s0; \ + val += (ACC_DATA_TYPE)x.s1 * (ACC_DATA_TYPE)y.s1; \ + val += (ACC_DATA_TYPE)x.s2 * (ACC_DATA_TYPE)y.s2; \ + val += (ACC_DATA_TYPE)x.s3 * (ACC_DATA_TYPE)y.s3; \ }) #endif // defined(ARM_COMPUTE_OPENCL_DOT8_ACC_ENABLED) && defined(cl_arm_integer_dot_product_accumulate_int8) -#define ARM_DOT1(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 3))0), (VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 3))0), c); \ +#define ARM_DOT1(a, b, c) \ + ({ \ + ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 3))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 3))0)), c); \ }) -#define ARM_DOT2(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 2))0), (VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 2))0), c); \ +#define ARM_DOT2(a, b, c) \ + ({ \ + ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (VEC_DATA_TYPE(SRC_DATA_TYPE, 2))0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (VEC_DATA_TYPE(WEI_DATA_TYPE, 2))0)), c); \ }) -#define ARM_DOT3(a, b, c) \ - ({ \ - ARM_DOT((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (SRC_DATA_TYPE)0), (VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (WEI_DATA_TYPE)0), c); \ +#define ARM_DOT3(a, b, c) \ + ({ \ + ARM_DOT(((VEC_DATA_TYPE(SRC_DATA_TYPE, 4))(a, (SRC_DATA_TYPE)0)), ((VEC_DATA_TYPE(WEI_DATA_TYPE, 4))(b, (WEI_DATA_TYPE)0)), c); \ }) #define ARM_DOT4(a, b, c) \ ({ \ @@ -539,14 +539,12 @@ __kernel void direct_convolution_nhwc( TENSOR_DOT(K0, 0); -#undef TENSOR_DOT - wei_offset += K0 * sizeof(WEI_DATA_TYPE); } #if(SRC_CHANNELS % K0) != 0 // Left-over accumulations - for(; i < SRC_CHANNELS; ++i) + for(; k < SRC_CHANNELS; ++k) { // Load values from src tensor LOAD_BLOCK_INDIRECT(M0, 1, SRC_DATA_TYPE, a, src_ptr, src_offset + k * sizeof(SRC_DATA_TYPE), src_stride_y, mi_valid_row, mi_mask); @@ -554,10 +552,6 @@ __kernel void direct_convolution_nhwc( // Load values from weights tensor LOAD_BLOCK(N0, 1, WEI_DATA_TYPE, b, wei_ptr, wei_offset, wei_stride_w, zero); -#define TENSOR_DOT(i) \ - ARM_DOT_K0XN0(1, a##i, b, c##i); \ - ARM_OFFSET_K0XN0(1, a##i, b, SRC_OFFSET, WEI_OFFSET, c##i); - TENSOR_DOT(1, 0); #undef TENSOR_DOT diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index 3ca72b3e5d..3948261bc3 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -375,7 +375,7 @@ void CLDirectConvolutionLayerKernel::configure(const CLCompileContext &compile_c const unsigned int n0 = win_config.second.x().step(); const unsigned int m0 = win_config.second.y().step(); - const unsigned int k0 = std::min(static_cast(_input->info()->dimension(channel_idx)), 16u); + const unsigned int k0 = adjust_vec_size(16u, _input->info()->dimension(channel_idx)); const unsigned int partial_store_n0 = _output->info()->dimension(channel_idx) % n0; const unsigned int partial_store_m0 = (_output->info()->dimension(width_idx) * _output->info()->dimension(height_idx)) % m0; const unsigned int pad_left = conv_info.pad_left(); diff --git a/tests/datasets/ShapeDatasets.h b/tests/datasets/ShapeDatasets.h index f8c0309d04..a7f1a44286 100644 --- a/tests/datasets/ShapeDatasets.h +++ b/tests/datasets/ShapeDatasets.h @@ -683,7 +683,7 @@ public: // Batch size 1 TensorShape{ 32U, 37U, 3U }, // Batch size 4 - TensorShape{ 32U, 37U, 3U, 4U }, + TensorShape{ 6U, 9U, 5U, 4U }, }) { } diff --git a/tests/validation/CL/DirectConvolutionLayer.cpp b/tests/validation/CL/DirectConvolutionLayer.cpp index ae2f22dd1e..3a6cacc0ba 100644 --- a/tests/validation/CL/DirectConvolutionLayer.cpp +++ b/tests/validation/CL/DirectConvolutionLayer.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -161,14 +161,178 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(zip(zip( bool is_valid = bool(CLDirectConvolutionLayer::validate(&input_info.clone()->set_is_resizable(false), &weights_info.clone()->set_is_resizable(false), &biases_info.clone()->set_is_resizable(false), &output_info.clone()->set_is_resizable(false), conv_info, act_info)); ARM_COMPUTE_EXPECT(is_valid == expected, framework::LogLevel::ERRORS); } -// clang-format on -// *INDENT-ON* template using CLDirectConvolutionLayerFixture = DirectConvolutionValidationFixture; template using CLDirectConvolutionValidationWithTensorShapesFixture = DirectConvolutionValidationWithTensorShapesFixture; +template +using CLDirectConvolutionLayerQuantizedFixture = DirectConvolutionValidationQuantizedFixture; +template +using CLDirectConvolutionValidationWithTensorShapesQuantizedFixture = DirectConvolutionValidationWithTensorShapesQuantizedFixture; + +TEST_SUITE(NHWC) +TEST_SUITE(FP16) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U), + TensorShape(9U, 5U, 6U, 4U), + TensorShape(3U, 5U, 7U, 2U), + TensorShape(32U, 37U, 3U) } ), + framework::dataset::make("StrideX", { 1, 3, 1, 1 })), + framework::dataset::make("StrideY", { 1, 3, 2, 1 })), + framework::dataset::make("PadX", { 1, 3, 0, 4 })), + framework::dataset::make("PadY", { 1, 3, 0, 4 })), + framework::dataset::make("KernelSize", { 3, 8, 1, 9 })), + framework::dataset::make("NumKernels", { 7, 3, 1, 3 })), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num); +} +FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ), + framework::dataset::make("StrideX", { 1 })), + framework::dataset::make("StrideY", { 1 })), + framework::dataset::make("PadX", { 1 })), + framework::dataset::make("PadY", { 1 })), + framework::dataset::make("KernelSize", { 9 })), + framework::dataset::make("NumKernels", { 3 })), + framework::dataset::make("DataType", DataType::F16)), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::IDENTITY) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num); +} + +TEST_SUITE_END() // FP16 + +TEST_SUITE(FP32) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U), + TensorShape(9U, 5U, 6U, 4U), + TensorShape(3U, 5U, 7U, 2U), + TensorShape(32U, 37U, 3U) } ), + framework::dataset::make("StrideX", { 1, 3, 1, 1 })), + framework::dataset::make("StrideY", { 1, 3, 2, 1 })), + framework::dataset::make("PadX", { 1, 3, 0, 4 })), + framework::dataset::make("PadY", { 1, 3, 0, 4 })), + framework::dataset::make("KernelSize", { 3, 8, 1, 9 })), + framework::dataset::make("NumKernels", { 7, 3, 1, 3 })), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_fp32); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ), + framework::dataset::make("StrideX", { 1 })), + framework::dataset::make("StrideY", { 1 })), + framework::dataset::make("PadX", { 1 })), + framework::dataset::make("PadY", { 1 })), + framework::dataset::make("KernelSize", { 9 })), + framework::dataset::make("NumKernels", { 3 })), + framework::dataset::make("DataType", DataType::F32)), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::IDENTITY) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_fp32); +} + +TEST_SUITE_END() // FP32 + +TEST_SUITE(Quantized) +TEST_SUITE(QASYMM8) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U), + TensorShape(9U, 5U, 6U, 4U), + TensorShape(3U, 5U, 7U, 2U), + TensorShape(32U, 37U, 3U) } ), + framework::dataset::make("StrideX", { 1, 3, 1, 1 })), + framework::dataset::make("StrideY", { 1, 3, 2, 1 })), + framework::dataset::make("PadX", { 1, 3, 0, 4 })), + framework::dataset::make("PadY", { 1, 3, 0, 4 })), + framework::dataset::make("KernelSize", { 3, 8, 1, 9 })), + framework::dataset::make("NumKernels", { 7, 3, 1, 3 })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("QuantizationInfo", QuantizationInfo(1.1f / 255, 10))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ), + framework::dataset::make("StrideX", { 1 })), + framework::dataset::make("StrideY", { 1 })), + framework::dataset::make("PadX", { 1 })), + framework::dataset::make("PadY", { 1 })), + framework::dataset::make("KernelSize", { 9 })), + framework::dataset::make("NumKernels", { 3 })), + framework::dataset::make("DataType", DataType::QASYMM8)), + framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255, 10))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +TEST_SUITE_END() // QASYMM8 +// +TEST_SUITE(QASYMM8_SIGNED) +FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::PRECOMMIT, + combine(combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(27U, 13U, 2U), + TensorShape(9U, 5U, 6U, 4U), + TensorShape(3U, 5U, 7U, 2U), + TensorShape(32U, 37U, 3U) } ), + framework::dataset::make("StrideX", { 1, 3, 1, 1 })), + framework::dataset::make("StrideY", { 1, 3, 2, 1 })), + framework::dataset::make("PadX", { 1, 3, 0, 4 })), + framework::dataset::make("PadY", { 1, 3, 0, 4 })), + framework::dataset::make("KernelSize", { 3, 8, 1, 9 })), + framework::dataset::make("NumKernels", { 7, 3, 1, 3 })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255, 10))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} + +FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerQuantizedFixture, framework::DatasetMode::NIGHTLY, + combine(combine(combine(combine(zip(zip(zip(zip(zip(zip( + framework::dataset::make("InputShape", { TensorShape(800U, 800U, 3U) } ), + framework::dataset::make("StrideX", { 1 })), + framework::dataset::make("StrideY", { 1 })), + framework::dataset::make("PadX", { 1 })), + framework::dataset::make("PadY", { 1 })), + framework::dataset::make("KernelSize", { 9 })), + framework::dataset::make("NumKernels", { 3 })), + framework::dataset::make("DataType", DataType::QASYMM8_SIGNED)), + framework::dataset::make("QuantizationInfo", QuantizationInfo(2.f / 255, 10))), + framework::dataset::make("ActivationInfo", ActivationLayerInfo(ActivationLayerInfo::ActivationFunction::RELU) )), + framework::dataset::make("DataLayout", DataLayout::NHWC))) +{ + validate(CLAccessor(_target), _reference, tolerance_qasymm8); +} +TEST_SUITE_END() // QASYMM8_SIGNED +TEST_SUITE_END() // Quantized +TEST_SUITE_END() // NHWC + +// clang-format on +// *INDENT-ON* +TEST_SUITE(NCHW) TEST_SUITE(Float) TEST_SUITE(FP16) FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit, framework::dataset::make("DataType", DataType::F16)), @@ -185,59 +349,22 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture, framewor // Validate output validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num); } -FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_9x9, framework::dataset::make("DataType", - DataType::F16)), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) -{ - validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num); -} -FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit_9x9, framework::dataset::make("DataType", - DataType::F16)), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) -{ - validate(CLAccessor(_target), _reference, tolerance_fp16, tolerance_num); -} TEST_SUITE_END() // FP16 TEST_SUITE(FP32) FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit, framework::dataset::make("DataType", DataType::F32)), ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { validate(CLAccessor(_target), _reference, tolerance_fp32); } FIXTURE_DATA_TEST_CASE(RunLarge, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly, framework::dataset::make("DataType", DataType::F32)), ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC }))) -{ - validate(CLAccessor(_target), _reference, tolerance_fp32); -} -FIXTURE_DATA_TEST_CASE(RunLarge9x9, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_9x9, framework::dataset::make("DataType", - DataType::F32)), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) + framework::dataset::make("DataLayout", { DataLayout::NCHW }))) { validate(CLAccessor(_target), _reference, tolerance_fp32); } -FIXTURE_DATA_TEST_CASE(RunSmall9x9, CLDirectConvolutionLayerFixture, framework::DatasetMode::PRECOMMIT, combine(combine(combine(data_precommit_9x9, framework::dataset::make("DataType", - DataType::F32)), - ActivationFunctionsDataset), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) -{ - validate(CLAccessor(_target), _reference, tolerance_fp32); -} - -FIXTURE_DATA_TEST_CASE(RunLargeUsecase, CLDirectConvolutionLayerFixture, framework::DatasetMode::NIGHTLY, combine(combine(combine(data_nightly_usecase, framework::dataset::make("DataType", - DataType::F32)), - framework::dataset::make("ActivationInfo", { ActivationLayerInfo() })), - framework::dataset::make("DataLayout", { DataLayout::NHWC }))) -{ - // Validate output - validate(CLAccessor(_target), _reference, tolerance_fp32, 0.f, tolerance_fp32_abs); -} TEST_SUITE_END() // FP32 TEST_SUITE(FP32_CustomDataset) @@ -251,11 +378,6 @@ FIXTURE_DATA_TEST_CASE(Run, CLDirectConvolutionValidationWithTensorShapesFixture TEST_SUITE_END() // FP32_CustomDataset TEST_SUITE_END() // Float -template -using CLDirectConvolutionLayerQuantizedFixture = DirectConvolutionValidationQuantizedFixture; -template -using CLDirectConvolutionValidationWithTensorShapesQuantizedFixture = DirectConvolutionValidationWithTensorShapesQuantizedFixture; - const auto QuantizedActivationFunctionsDataset = framework::dataset::make("ActivationInfo", { ActivationLayerInfo(), @@ -269,7 +391,7 @@ FIXTURE_DATA_TEST_CASE(RunSmall, CLDirectConvolutionLayerQuantizedFixture