diff options
Diffstat (limited to 'examples/gemm_tuner')
-rw-r--r-- | examples/gemm_tuner/CommonGemmExampleOptions.cpp | 16 | ||||
-rw-r--r-- | examples/gemm_tuner/CommonGemmExampleOptions.h | 28 | ||||
-rw-r--r-- | examples/gemm_tuner/GemmTuner.py | 2 | ||||
-rw-r--r-- | examples/gemm_tuner/GemmTunerHelpers.h | 6 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemm_native.cpp | 28 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemm_reshaped.cpp | 88 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp | 66 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemmlowp_reshaped.cpp | 109 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp | 110 |
9 files changed, 243 insertions, 210 deletions
diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.cpp b/examples/gemm_tuner/CommonGemmExampleOptions.cpp index bee202b99e..c2a465604a 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.cpp +++ b/examples/gemm_tuner/CommonGemmExampleOptions.cpp @@ -39,7 +39,8 @@ using namespace utils; return os; } -CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type) +CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, + arm_compute::DataType default_data_type) : help(parser.add_option<ToggleOption>("help")), M(parser.add_positional_option<SimpleOption<size_t>>("M", 100)), N(parser.add_positional_option<SimpleOption<size_t>>("N", 100)), @@ -48,21 +49,16 @@ CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLi data_type(), tuner_mode() { - const std::set<DataType> supported_data_types - { + const std::set<DataType> supported_data_types{ DataType::F16, DataType::F32, DataType::QASYMM8, }; - const std::set<CLTunerMode> supported_tuner_modes - { - CLTunerMode::EXHAUSTIVE, - CLTunerMode::NORMAL, - CLTunerMode::RAPID - }; + const std::set<CLTunerMode> supported_tuner_modes{CLTunerMode::EXHAUSTIVE, CLTunerMode::NORMAL, CLTunerMode::RAPID}; - ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), "Default data type unsupported"); + ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), + "Default data type unsupported"); data_type = parser.add_option<EnumOption<DataType>>("type", supported_data_types, default_data_type); tuner_mode = parser.add_option<EnumOption<CLTunerMode>>("tuner-mode", supported_tuner_modes, CLTunerMode::RAPID); diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.h b/examples/gemm_tuner/CommonGemmExampleOptions.h index f7447e3db3..38178bcef8 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.h +++ b/examples/gemm_tuner/CommonGemmExampleOptions.h @@ -27,21 +27,22 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "utils/TypePrinter.h" + #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/TypePrinter.h" namespace gemm_tuner { /** Structure holding all the common gemm example parameters */ struct CommonGemmExampleParams { - size_t M{ 100 }; /**< Number of lhs matrix rows */ - size_t N{ 100 }; /**< Number of rhs matrix columns */ - size_t K{ 50 }; /**< Number of lhs matrix columns/rhs matrix rows */ - size_t B{ 1 }; /**< Batch size */ - arm_compute::DataType data_type{ arm_compute::DataType::F32 }; /**< Data type */ - arm_compute::CLTunerMode tuner_mode{ arm_compute::CLTunerMode::RAPID }; /**< OpenCL tuner mode */ + size_t M{100}; /**< Number of lhs matrix rows */ + size_t N{100}; /**< Number of rhs matrix columns */ + size_t K{50}; /**< Number of lhs matrix columns/rhs matrix rows */ + size_t B{1}; /**< Batch size */ + arm_compute::DataType data_type{arm_compute::DataType::F32}; /**< Data type */ + arm_compute::CLTunerMode tuner_mode{arm_compute::CLTunerMode::RAPID}; /**< OpenCL tuner mode */ }; /** Formatted output of the CommonGemmExampleParams type @@ -70,7 +71,8 @@ public: * @param[in,out] parser A parser on which "parse()" hasn't been called yet. * @param[in] default_data_type Default data type if unspecified. */ - CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type = arm_compute::DataType::F32); + CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, + arm_compute::DataType default_data_type = arm_compute::DataType::F32); /** Prevent instances of this class from being copied (As this class contains pointers) */ CommonGemmExampleOptions(const CommonGemmExampleOptions &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -82,11 +84,11 @@ public: /** Default destructor */ ~CommonGemmExampleOptions() = default; - arm_compute::utils::ToggleOption *help; /**< Show help option */ - arm_compute::utils::SimpleOption<size_t> *M; /**< Number of lhs matrix rows option */ - arm_compute::utils::SimpleOption<size_t> *N; /**< Number of rhs matrix columns option */ - arm_compute::utils::SimpleOption<size_t> *K; /**< Number of lhs matrix columns/rhs matrix rows option */ - arm_compute::utils::SimpleOption<size_t> *B; /**< Batch size option */ + arm_compute::utils::ToggleOption *help; /**< Show help option */ + arm_compute::utils::SimpleOption<size_t> *M; /**< Number of lhs matrix rows option */ + arm_compute::utils::SimpleOption<size_t> *N; /**< Number of rhs matrix columns option */ + arm_compute::utils::SimpleOption<size_t> *K; /**< Number of lhs matrix columns/rhs matrix rows option */ + arm_compute::utils::SimpleOption<size_t> *B; /**< Batch size option */ arm_compute::utils::EnumOption<arm_compute::DataType> *data_type; /**< Data type */ arm_compute::utils::EnumOption<arm_compute::CLTunerMode> *tuner_mode; /**< OpenCL tuner mode */ }; diff --git a/examples/gemm_tuner/GemmTuner.py b/examples/gemm_tuner/GemmTuner.py index 3e75051ffc..ef1f31493e 100644 --- a/examples/gemm_tuner/GemmTuner.py +++ b/examples/gemm_tuner/GemmTuner.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020 ARM Limited. +# Copyright (c) 2019-2020 Arm Limited. # # SPDX-License-Identifier: MIT # diff --git a/examples/gemm_tuner/GemmTunerHelpers.h b/examples/gemm_tuner/GemmTunerHelpers.h index ae5cfbb19e..dbff9e2dff 100644 --- a/examples/gemm_tuner/GemmTunerHelpers.h +++ b/examples/gemm_tuner/GemmTunerHelpers.h @@ -36,9 +36,9 @@ bool update_padding_for_cl_image(arm_compute::ITensorInfo *tensor) constexpr unsigned int num_floats_per_pixel = 4; const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); - const unsigned int pixel_aligment = arm_compute::get_cl_image_pitch_alignment( - arm_compute::CLKernelLibrary::get().get_device()); - if(pixel_aligment == 0) + const unsigned int pixel_aligment = + arm_compute::get_cl_image_pitch_alignment(arm_compute::CLKernelLibrary::get().get_device()); + if (pixel_aligment == 0) { return false; } diff --git a/examples/gemm_tuner/cl_gemm_native.cpp b/examples/gemm_tuner/cl_gemm_native.cpp index 093935f716..7daa0b07d3 100644 --- a/examples/gemm_tuner/cl_gemm_native.cpp +++ b/examples/gemm_tuner/cl_gemm_native.cpp @@ -25,19 +25,20 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" + +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" #include <cstdlib> using namespace arm_compute; @@ -51,9 +52,9 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** Formatted output of the GemmConfigs type @@ -145,13 +146,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -198,8 +199,9 @@ public: // Validate argments Status status{}; - status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, + kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -221,11 +223,7 @@ public: void do_run() override { // Execute the function - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, - { ACL_SRC_1, &rhs }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp index e6caeec873..75f3539cb9 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,14 +31,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" -#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" +#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" #include <cstdlib> @@ -53,16 +54,16 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ true }; /**< Interleave lhs matrix */ - bool transpose_lhs{ true }; /**< Transpose lhs matrix. */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix. */ - bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image. */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{true}; /**< Interleave lhs matrix */ + bool transpose_lhs{true}; /**< Transpose lhs matrix. */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix. */ + bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image. */ }; /** Formatted output of the GemmConfigs type @@ -119,8 +120,10 @@ public: // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)"); - export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); + transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do " + "transpose lhs matrix (0)"); + export_to_cl_image_rhs->set_help( + "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -133,17 +136,18 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ + SimpleOption<size_t> * + transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/ }; @@ -198,13 +202,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -256,15 +260,22 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); + } + // Initialise lhs_reshaped tensor info - lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); + lhs_reshaped.allocator()->init( + TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); // Initialise rhs_reshaped tensor info - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -274,7 +285,7 @@ public: // Validate argments Status status{}; status = reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, kernel_info.reinterpret_input_as_3d); - if(!status) + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -282,8 +293,9 @@ public: return false; } - status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -295,7 +307,8 @@ public: reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); // Configure function - gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -310,15 +323,12 @@ public: void do_run() override { // Execute the functions - ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }); + ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}}); reshape_lhs.run(reshape_lsh_pack); - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs_reshaped }, - { ACL_SRC_1, &rhs_reshaped }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); - reshape_lhs.run(gemm_pack); + ITensorPack gemm_pack( + {{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp index dbaaca6048..cfea2c9bac 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,20 +25,21 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" + +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" +#include "GemmTunerHelpers.h" #include <cstdlib> using namespace arm_compute; @@ -52,13 +53,13 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix */ - bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image.*/ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix */ + bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image.*/ }; /** Formatted output of the GemmConfigs type @@ -106,7 +107,8 @@ public: h0->set_help("Number of horizontal blocks of size (k0xn0) stored on the same output row"); interleave_rhs->set_help("Interleave rhs matrix (1) / Do not interleave rhs matrix (0)"); transpose_rhs->set_help("Transpose rhs matrix (1) / Do not transpose rhs matrix (0)"); - export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); + export_to_cl_image_rhs->set_help( + "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -119,10 +121,10 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/ @@ -170,13 +172,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -225,12 +227,18 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); + } + // Initialise rhs_reshaped tensor info - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -239,8 +247,9 @@ public: // Validate argments Status status{}; - status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -249,7 +258,8 @@ public: } // Configure function - gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, + kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -263,11 +273,7 @@ public: void do_run() override { // Execute the function - ITensorPack gemm_pack({ { ACL_SRC_0, &lhs }, - { ACL_SRC_1, &rhs_reshaped }, - { ACL_SRC_2, &bias }, - { ACL_DST, &dst } - }); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp index 3d3f7fef1e..3808b98b7d 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,14 +31,15 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" -#include "src/core/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" +#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" #include <cstdlib> @@ -53,15 +54,15 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ true }; /**< Interleave lhs matrix */ - bool transpose_lhs{ true }; /**< Transpose lhs matrix. */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix. */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{true}; /**< Interleave lhs matrix */ + bool transpose_lhs{true}; /**< Transpose lhs matrix. */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix. */ }; /** Formatted output of the GemmConfigs type @@ -116,7 +117,8 @@ public: // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)"); + transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do " + "transpose lhs matrix (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -129,17 +131,18 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ + SimpleOption<size_t> * + transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ }; /** Consumes the gemm configuration options and creates a structure containing all information @@ -168,8 +171,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMReshapeLHSMatrix = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>; -using CLGEMMLowpMatrixMultiplyReshaped = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>; +using ClGemmReshapeLHSMatrix = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>; +using ClGemmLowpMatrixMultiplyReshaped = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedKernel>; class CLGEMMLowpMatrixMultiplyReshapedExample : public Example { @@ -186,12 +189,12 @@ public: GemmConfigOptions config_options(parser); parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -217,10 +220,7 @@ public: rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type)); // Set arbitrary quantization information - const QuantizationInfo q_info - { - 0.012, 3 - }; + const QuantizationInfo q_info{0.012, 3}; lhs.info()->set_quantization_info(q_info); rhs.info()->set_quantization_info(q_info); dst.info()->set_quantization_info(q_info); @@ -240,49 +240,53 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet - lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U); + } + + lhs_reshaped.allocator()->init( + TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); lhs_reshaped.info()->set_quantization_info(q_info); rhs_reshaped.info()->set_quantization_info(q_info); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; } } - GEMMReshapeInfo gemm_info - { - static_cast<int>(params.M), - static_cast<int>(params.N), - static_cast<int>(params.K), - static_cast<int>(configs.h0), - static_cast<int>(configs.v0), - 0, - false, - true - }; + GEMMReshapeInfo gemm_info{static_cast<int>(params.M), + static_cast<int>(params.N), + static_cast<int>(params.K), + static_cast<int>(configs.h0), + static_cast<int>(configs.v0), + 0, + false, + true}; // Validate argments - if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) + if (!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) { - std::cerr << "Invalid arguments for CLGEMMReshapeLHSMatrixKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl; return false; } - if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) + if (!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl; return false; } // Configure functions reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info); // Allocate tensors lhs.allocator()->allocate(); @@ -295,10 +299,11 @@ public: } void do_run() override { - ITensorPack reshape_lsh_pack({ { ACL_SRC, &lhs }, { ACL_DST, &lhs_reshaped } }); + ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}}); reshape_lhs.run(reshape_lsh_pack); - gemm.run(); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -315,8 +320,8 @@ private: CLTensor rhs_reshaped{}; CLTensor dst{}; CLTuner tuner{}; - CLGEMMReshapeLHSMatrix reshape_lhs{}; - CLGEMMLowpMatrixMultiplyReshaped gemm{}; + ClGemmReshapeLHSMatrix reshape_lhs{}; + ClGemmLowpMatrixMultiplyReshaped gemm{}; }; /** Main test program for gemmlowp reshaped diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp index d8f8f1498a..4acb316a3c 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,28 +25,29 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" + +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" +#include "GemmTunerHelpers.h" #include <cstdlib> #include <memory> using namespace arm_compute; using namespace utils; +using namespace arm_compute::opencl::kernels; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -55,12 +56,12 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix */ }; /** Formatted output of the GemmConfigs type @@ -118,10 +119,10 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */ }; @@ -146,8 +147,9 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>; -using CLGEMMLowpMatrixAReduction = test::CLSynthetizeFunction<CLGEMMLowpMatrixAReductionKernel>; +using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = + test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>; +using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>; class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example { @@ -164,12 +166,12 @@ public: GemmConfigOptions config_options(parser); parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -198,10 +200,7 @@ public: // Set arbitrary quantization information (non-zero offset to ensure offset contribution stage is included) // Could be extended in the future to include a user-controlled option for offset == 0 - const QuantizationInfo q_info - { - 0.012, 3 - }; + const QuantizationInfo q_info{0.012, 3}; lhs.info()->set_quantization_info(q_info); rhs.info()->set_quantization_info(q_info); bias.info()->set_quantization_info(q_info); @@ -219,11 +218,17 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U); + } + + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); rhs_reshaped.info()->set_quantization_info(q_info); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -236,7 +241,6 @@ public: gemmlowp_output_stage.output_data_type = dst.info()->data_type(); gemmlowp_output_stage.gemmlowp_offset = 0; { - const int idx_kernels = get_data_layout_dimension_index(lhs.info()->data_layout(), DataLayoutDimension::BATCHES); gemmlowp_output_stage.is_quantized_per_channel = false; // Num_filters is 1 unless quantized type is of per_channel type. Could be extended in the future to support per-channel quantization. const unsigned int num_filters = 1; @@ -246,10 +250,7 @@ public: gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(lhs.info(), - rhs.info(), - dst.info(), - idx_kernels, + quantization::compute_quantized_multipliers_and_shifts(lhs.info(), rhs.info(), dst.info(), gemmlowp_output_stage.gemmlowp_multipliers.data(), gemmlowp_output_stage.gemmlowp_shifts.data()); gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; @@ -286,23 +287,23 @@ public: gemm_info.output_stage = gemmlowp_output_stage; // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32); vector_sum_row.allocator()->init(info_vector_sum_row); - mtx_a_reduction = std::make_unique<CLGEMMLowpMatrixAReduction>(); + mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>(); - if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) + if (!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) { std::cerr << "Invalid arguments for CLGEMMLowpMatrixAReductionKernel." << std::endl; return false; } - mtx_a_reduction->configure(&lhs, &vector_sum_row, GEMMLowpReductionKernelInfo{}); + mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}); } // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { const TensorInfo info_vector_sum_col(compute_reductionA_shape(*rhs.info()), 1, DataType::S32); vector_sum_col.allocator()->init(info_vector_sum_col); @@ -310,15 +311,20 @@ public: } // Validate argments - if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), - gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info())) + if (!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), + gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), + dst_multipliers.info(), dst_shifts.info())) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl; return false; } // Configure function - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info, gemm_info.a_offset == 0 ? nullptr : &vector_sum_col, gemm_info.b_offset == 0 ? nullptr : &vector_sum_row, &bias, &dst_multipliers, &dst_shifts); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), + gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), + dst_shifts.info()); // Allocate tensors lhs.allocator()->allocate(); @@ -335,11 +341,21 @@ public: } void do_run() override { - if(mtx_a_reduction != nullptr) + if (mtx_a_reduction != nullptr) { - mtx_a_reduction->run(); + ITensorPack red_pack({{ACL_SRC, &lhs}, {ACL_DST, &dst}}); + mtx_a_reduction->run(red_pack); } - gemm.run(); + + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, + {ACL_SRC_1, &rhs}, + {ACL_BIAS, &bias}, + {ACL_VEC_COL_SUM, &vector_sum_col}, + {ACL_VEC_ROW_SUM, &vector_sum_row}, + {ACL_SHIFTS, &dst_shifts}, + {ACL_MULTIPLIERS, &dst_multipliers}, + {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -360,8 +376,8 @@ private: CLTensor dst_multipliers{}; CLTensor dst_shifts{}; CLTuner tuner{}; - CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm{}; - std::unique_ptr<CLGEMMLowpMatrixAReduction> mtx_a_reduction{ nullptr }; + ClGemmLowpMatrixMultiplyReshapedOnlyRhs gemm{}; + std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{nullptr}; }; /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint |