diff options
Diffstat (limited to 'examples/gemm_tuner')
-rw-r--r-- | examples/gemm_tuner/CommonGemmExampleOptions.cpp | 16 | ||||
-rw-r--r-- | examples/gemm_tuner/CommonGemmExampleOptions.h | 28 | ||||
-rw-r--r-- | examples/gemm_tuner/GemmTuner.py | 2 | ||||
-rw-r--r-- | examples/gemm_tuner/GemmTunerHelpers.h | 6 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemm_native.cpp | 32 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemm_reshaped.cpp | 101 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp | 68 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemmlowp_reshaped.cpp | 114 | ||||
-rw-r--r-- | examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp | 112 |
9 files changed, 268 insertions, 211 deletions
diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.cpp b/examples/gemm_tuner/CommonGemmExampleOptions.cpp index f1306ccf5c..c2a465604a 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.cpp +++ b/examples/gemm_tuner/CommonGemmExampleOptions.cpp @@ -39,7 +39,8 @@ using namespace utils; return os; } -CommonGemmExampleOptions::CommonGemmExampleOptions(CommandLineParser &parser, DataType default_data_type) +CommonGemmExampleOptions::CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, + arm_compute::DataType default_data_type) : help(parser.add_option<ToggleOption>("help")), M(parser.add_positional_option<SimpleOption<size_t>>("M", 100)), N(parser.add_positional_option<SimpleOption<size_t>>("N", 100)), @@ -48,21 +49,16 @@ CommonGemmExampleOptions::CommonGemmExampleOptions(CommandLineParser &parser, Da data_type(), tuner_mode() { - const std::set<DataType> supported_data_types - { + const std::set<DataType> supported_data_types{ DataType::F16, DataType::F32, DataType::QASYMM8, }; - const std::set<CLTunerMode> supported_tuner_modes - { - CLTunerMode::EXHAUSTIVE, - CLTunerMode::NORMAL, - CLTunerMode::RAPID - }; + const std::set<CLTunerMode> supported_tuner_modes{CLTunerMode::EXHAUSTIVE, CLTunerMode::NORMAL, CLTunerMode::RAPID}; - ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), "Default data type unsupported"); + ARM_COMPUTE_ERROR_ON_MSG(supported_data_types.find(default_data_type) == supported_data_types.end(), + "Default data type unsupported"); data_type = parser.add_option<EnumOption<DataType>>("type", supported_data_types, default_data_type); tuner_mode = parser.add_option<EnumOption<CLTunerMode>>("tuner-mode", supported_tuner_modes, CLTunerMode::RAPID); diff --git a/examples/gemm_tuner/CommonGemmExampleOptions.h b/examples/gemm_tuner/CommonGemmExampleOptions.h index f7447e3db3..38178bcef8 100644 --- a/examples/gemm_tuner/CommonGemmExampleOptions.h +++ b/examples/gemm_tuner/CommonGemmExampleOptions.h @@ -27,21 +27,22 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "utils/TypePrinter.h" + #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/TypePrinter.h" namespace gemm_tuner { /** Structure holding all the common gemm example parameters */ struct CommonGemmExampleParams { - size_t M{ 100 }; /**< Number of lhs matrix rows */ - size_t N{ 100 }; /**< Number of rhs matrix columns */ - size_t K{ 50 }; /**< Number of lhs matrix columns/rhs matrix rows */ - size_t B{ 1 }; /**< Batch size */ - arm_compute::DataType data_type{ arm_compute::DataType::F32 }; /**< Data type */ - arm_compute::CLTunerMode tuner_mode{ arm_compute::CLTunerMode::RAPID }; /**< OpenCL tuner mode */ + size_t M{100}; /**< Number of lhs matrix rows */ + size_t N{100}; /**< Number of rhs matrix columns */ + size_t K{50}; /**< Number of lhs matrix columns/rhs matrix rows */ + size_t B{1}; /**< Batch size */ + arm_compute::DataType data_type{arm_compute::DataType::F32}; /**< Data type */ + arm_compute::CLTunerMode tuner_mode{arm_compute::CLTunerMode::RAPID}; /**< OpenCL tuner mode */ }; /** Formatted output of the CommonGemmExampleParams type @@ -70,7 +71,8 @@ public: * @param[in,out] parser A parser on which "parse()" hasn't been called yet. * @param[in] default_data_type Default data type if unspecified. */ - CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, arm_compute::DataType default_data_type = arm_compute::DataType::F32); + CommonGemmExampleOptions(arm_compute::utils::CommandLineParser &parser, + arm_compute::DataType default_data_type = arm_compute::DataType::F32); /** Prevent instances of this class from being copied (As this class contains pointers) */ CommonGemmExampleOptions(const CommonGemmExampleOptions &) = delete; /** Prevent instances of this class from being copied (As this class contains pointers) */ @@ -82,11 +84,11 @@ public: /** Default destructor */ ~CommonGemmExampleOptions() = default; - arm_compute::utils::ToggleOption *help; /**< Show help option */ - arm_compute::utils::SimpleOption<size_t> *M; /**< Number of lhs matrix rows option */ - arm_compute::utils::SimpleOption<size_t> *N; /**< Number of rhs matrix columns option */ - arm_compute::utils::SimpleOption<size_t> *K; /**< Number of lhs matrix columns/rhs matrix rows option */ - arm_compute::utils::SimpleOption<size_t> *B; /**< Batch size option */ + arm_compute::utils::ToggleOption *help; /**< Show help option */ + arm_compute::utils::SimpleOption<size_t> *M; /**< Number of lhs matrix rows option */ + arm_compute::utils::SimpleOption<size_t> *N; /**< Number of rhs matrix columns option */ + arm_compute::utils::SimpleOption<size_t> *K; /**< Number of lhs matrix columns/rhs matrix rows option */ + arm_compute::utils::SimpleOption<size_t> *B; /**< Batch size option */ arm_compute::utils::EnumOption<arm_compute::DataType> *data_type; /**< Data type */ arm_compute::utils::EnumOption<arm_compute::CLTunerMode> *tuner_mode; /**< OpenCL tuner mode */ }; diff --git a/examples/gemm_tuner/GemmTuner.py b/examples/gemm_tuner/GemmTuner.py index 3e75051ffc..ef1f31493e 100644 --- a/examples/gemm_tuner/GemmTuner.py +++ b/examples/gemm_tuner/GemmTuner.py @@ -1,4 +1,4 @@ -# Copyright (c) 2019-2020 ARM Limited. +# Copyright (c) 2019-2020 Arm Limited. # # SPDX-License-Identifier: MIT # diff --git a/examples/gemm_tuner/GemmTunerHelpers.h b/examples/gemm_tuner/GemmTunerHelpers.h index ae5cfbb19e..dbff9e2dff 100644 --- a/examples/gemm_tuner/GemmTunerHelpers.h +++ b/examples/gemm_tuner/GemmTunerHelpers.h @@ -36,9 +36,9 @@ bool update_padding_for_cl_image(arm_compute::ITensorInfo *tensor) constexpr unsigned int num_floats_per_pixel = 4; const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); - const unsigned int pixel_aligment = arm_compute::get_cl_image_pitch_alignment( - arm_compute::CLKernelLibrary::get().get_device()); - if(pixel_aligment == 0) + const unsigned int pixel_aligment = + arm_compute::get_cl_image_pitch_alignment(arm_compute::CLKernelLibrary::get().get_device()); + if (pixel_aligment == 0) { return false; } diff --git a/examples/gemm_tuner/cl_gemm_native.cpp b/examples/gemm_tuner/cl_gemm_native.cpp index 5a144dabf7..7daa0b07d3 100644 --- a/examples/gemm_tuner/cl_gemm_native.cpp +++ b/examples/gemm_tuner/cl_gemm_native.cpp @@ -25,22 +25,24 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/CL/kernels/CLGEMMMatrixMultiplyNativeKernel.h" + +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" #include <cstdlib> using namespace arm_compute; +using namespace arm_compute::opencl::kernels; using namespace utils; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -50,9 +52,9 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ }; /** Formatted output of the GemmConfigs type @@ -122,8 +124,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } } // namespace -// Create function for CLGEMMMatrixMultiplyNativeKernel -using CLGEMMMatrixMultiplyNative = test::CLSynthetizeFunction<CLGEMMMatrixMultiplyNativeKernel>; +// Create function for ClGemmMatrixMultiplyNativeKernel +using CLGEMMMatrixMultiplyNative = test::CLSynthetizeOperator<ClGemmMatrixMultiplyNativeKernel>; class CLGEMMMatrixMultiplyNativeExample : public Example { @@ -144,13 +146,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -197,8 +199,9 @@ public: // Validate argments Status status{}; - status = gemm.validate((&lhs)->info(), (&rhs)->info(), (&bias)->info(), (&dst)->info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, + kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -207,7 +210,7 @@ public: } // Configure function - gemm.configure(&lhs, &rhs, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs.info(), rhs.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -220,7 +223,8 @@ public: void do_run() override { // Execute the function - gemm.run(); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); diff --git a/examples/gemm_tuner/cl_gemm_reshaped.cpp b/examples/gemm_tuner/cl_gemm_reshaped.cpp index 444a342d74..75f3539cb9 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,18 +31,20 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" -#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedKernel.h" -#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" +#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" #include <cstdlib> using namespace arm_compute; +using namespace arm_compute::opencl::kernels; using namespace utils; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -52,16 +54,16 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ true }; /**< Interleave lhs matrix */ - bool transpose_lhs{ true }; /**< Transpose lhs matrix. */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix. */ - bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image. */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{true}; /**< Interleave lhs matrix */ + bool transpose_lhs{true}; /**< Transpose lhs matrix. */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix. */ + bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image. */ }; /** Formatted output of the GemmConfigs type @@ -118,8 +120,10 @@ public: // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)"); - export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); + transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do " + "transpose lhs matrix (0)"); + export_to_cl_image_rhs->set_help( + "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -132,17 +136,18 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ + SimpleOption<size_t> * + transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/ }; @@ -172,10 +177,11 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } } // namespace -// Create function for CLGEMMReshapeLHSMatrixKernel -using CLGEMMReshapeLHSMatrix = test::CLSynthetizeFunction<CLGEMMReshapeLHSMatrixKernel>; -// Create function for CLGEMMMatrixMultiplyReshapedKernel -using CLGEMMMatrixMultiplyReshaped = test::CLSynthetizeFunction<CLGEMMMatrixMultiplyReshapedKernel>; + +// Create function for ClGemmReshapeLhsMatrixKernel +using CLGEMMReshapeLHSMatrix = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>; +// Create function for ClGemmMatrixMultiplyReshapedKernel +using CLGEMMMatrixMultiplyReshaped = test::CLSynthetizeOperator<ClGemmMatrixMultiplyReshapedKernel>; class CLGEMMMatrixMultiplyReshapedExample : public Example { @@ -196,13 +202,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -254,15 +260,22 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); + } + // Initialise lhs_reshaped tensor info - lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); + lhs_reshaped.allocator()->init( + TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); // Initialise rhs_reshaped tensor info - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -271,8 +284,8 @@ public: // Validate argments Status status{}; - status = reshape_lhs.validate((&lhs)->info(), (&lhs_reshaped)->info(), lhs_info, kernel_info.reinterpret_input_as_3d); - if(!status) + status = reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, kernel_info.reinterpret_input_as_3d); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -280,8 +293,9 @@ public: return false; } - status = gemm.validate((&lhs_reshaped)->info(), (&rhs_reshaped)->info(), (&bias)->info(), (&dst)->info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -290,10 +304,11 @@ public: } // Configure reshape lhs function - reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info); + reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); // Configure function - gemm.configure(&lhs_reshaped, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -307,9 +322,13 @@ public: } void do_run() override { - // Execute the function - reshape_lhs.run(); - gemm.run(); + // Execute the functions + ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}}); + reshape_lhs.run(reshape_lsh_pack); + + ITensorPack gemm_pack( + {{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); diff --git a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp index 68bec9da6e..cfea2c9bac 100644 --- a/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp +++ b/examples/gemm_tuner/cl_gemm_reshaped_rhs_only.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2019-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,23 +25,25 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/CL/kernels/CLGEMMMatrixMultiplyReshapedOnlyRHSKernel.h" + +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" +#include "GemmTunerHelpers.h" #include <cstdlib> using namespace arm_compute; +using namespace arm_compute::opencl::kernels; using namespace utils; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -51,13 +53,13 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix */ - bool export_to_cl_image_rhs{ true }; /**< Export rhs matrix to cl_image.*/ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix */ + bool export_to_cl_image_rhs{true}; /**< Export rhs matrix to cl_image.*/ }; /** Formatted output of the GemmConfigs type @@ -105,7 +107,8 @@ public: h0->set_help("Number of horizontal blocks of size (k0xn0) stored on the same output row"); interleave_rhs->set_help("Interleave rhs matrix (1) / Do not interleave rhs matrix (0)"); transpose_rhs->set_help("Transpose rhs matrix (1) / Do not transpose rhs matrix (0)"); - export_to_cl_image_rhs->set_help("Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); + export_to_cl_image_rhs->set_help( + "Export rhs matrix to cl_image (1) / Do not export rhs matrix to cl_image (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -118,10 +121,10 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *export_to_cl_image_rhs; /**< Export rhs matrix to cl_image.*/ @@ -147,8 +150,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } } // namespace -// Create function for CLGEMMMatrixMultiplyReshapedOnlyRHSKernel -using CLGEMMMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction<CLGEMMMatrixMultiplyReshapedOnlyRHSKernel>; +// Create function for ClGemmMatrixMultiplyReshapedOnlyRhsKernel +using CLGEMMMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeOperator<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>; class CLGEMMMatrixMultiplyReshapedOnlyRHSExample : public Example { @@ -169,13 +172,13 @@ public: // Parse command line options parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { // Print help message parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -224,12 +227,18 @@ public: kernel_info.broadcast_bias = true; kernel_info.activation_info = act_info; + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(kernel_info.n / rhs_info.n0, 1U); + } + // Initialise rhs_reshaped tensor info - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -238,8 +247,9 @@ public: // Validate argments Status status{}; - status = gemm.validate((&lhs)->info(), (&rhs_reshaped)->info(), (&bias)->info(), (&dst)->info(), alpha, beta, lhs_info, rhs_info, kernel_info); - if(!status) + status = gemm.validate(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, + rhs_info, kernel_info); + if (!status) { // Unsupported arguments std::cerr << "Unsupported arguments." << std::endl; @@ -248,7 +258,8 @@ public: } // Configure function - gemm.configure(&lhs, &rhs_reshaped, &bias, &dst, alpha, beta, lhs_info, rhs_info, kernel_info); + gemm.configure(lhs.info(), rhs_reshaped.info(), bias.info(), dst.info(), alpha, beta, lhs_info, rhs_info, + kernel_info); // Allocate tensors lhs.allocator()->allocate(); @@ -262,7 +273,8 @@ public: void do_run() override { // Execute the function - gemm.run(); + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, {ACL_SRC_1, &rhs_reshaped}, {ACL_SRC_2, &bias}, {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp index 5b81963752..3808b98b7d 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -31,18 +31,20 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" + #include "examples/gemm_tuner/CommonGemmExampleOptions.h" #include "examples/gemm_tuner/GemmTunerHelpers.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedKernel.h" -#include "src/core/CL/kernels/CLGEMMReshapeLHSMatrixKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" +#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" #include <cstdlib> using namespace arm_compute; +using namespace arm_compute::opencl::kernels; using namespace utils; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -52,15 +54,15 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t v0{ 1 }; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_lhs{ true }; /**< Interleave lhs matrix */ - bool transpose_lhs{ true }; /**< Transpose lhs matrix. */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix. */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t v0{1}; /**< Number of vertical blocks of size (m0xk0) stored on the same output row */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_lhs{true}; /**< Interleave lhs matrix */ + bool transpose_lhs{true}; /**< Transpose lhs matrix. */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix. */ }; /** Formatted output of the GemmConfigs type @@ -115,7 +117,8 @@ public: // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do transpose lhs matrix (0)"); + transpose_rhs->set_help("Transpose rhs matrix but not lhs matrix (1) / Do not transpose rhs matrix but do " + "transpose lhs matrix (0)"); } /** Prevent instances of this class from being copied (As this class contains pointers) */ GemmConfigOptions(const GemmConfigOptions &) = delete; @@ -128,17 +131,18 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *v0; /**< Number of vertical blocks of size (m0xk0) stored on the same output row option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_lhs; /**< Interleave lhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ // FIXME: Currently we only support 2 variants of the gemm reshaped kernels in which transpose_lhs and // transpose_rhs are the opposites of each other. In the future we may extend the kernels to include the other // 2 variants (both transposed and none transposed) - SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ + SimpleOption<size_t> * + transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable). Also set the lhs matrix transpose option to the opposite. */ }; /** Consumes the gemm configuration options and creates a structure containing all information @@ -167,8 +171,8 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMReshapeLHSMatrix = test::CLSynthetizeFunction<CLGEMMReshapeLHSMatrixKernel>; -using CLGEMMLowpMatrixMultiplyReshaped = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedKernel>; +using ClGemmReshapeLHSMatrix = test::CLSynthetizeOperator<ClGemmReshapeLhsMatrixKernel>; +using ClGemmLowpMatrixMultiplyReshaped = test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedKernel>; class CLGEMMLowpMatrixMultiplyReshapedExample : public Example { @@ -185,12 +189,12 @@ public: GemmConfigOptions config_options(parser); parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -216,10 +220,7 @@ public: rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type)); // Set arbitrary quantization information - const QuantizationInfo q_info - { - 0.012, 3 - }; + const QuantizationInfo q_info{0.012, 3}; lhs.info()->set_quantization_info(q_info); rhs.info()->set_quantization_info(q_info); dst.info()->set_quantization_info(q_info); @@ -239,49 +240,53 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet - lhs_reshaped.allocator()->init(TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U); + } + + lhs_reshaped.allocator()->init( + TensorInfo(compute_lhs_reshaped_shape(*lhs.info(), lhs_info), 1, params.data_type)); + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); lhs_reshaped.info()->set_quantization_info(q_info); rhs_reshaped.info()->set_quantization_info(q_info); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; } } - GEMMReshapeInfo gemm_info - { - static_cast<int>(params.M), - static_cast<int>(params.N), - static_cast<int>(params.K), - static_cast<int>(configs.h0), - static_cast<int>(configs.v0), - 0, - false, - true - }; + GEMMReshapeInfo gemm_info{static_cast<int>(params.M), + static_cast<int>(params.N), + static_cast<int>(params.K), + static_cast<int>(configs.h0), + static_cast<int>(configs.v0), + 0, + false, + true}; // Validate argments - if(!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) + if (!reshape_lhs.validate(lhs.info(), lhs_reshaped.info(), lhs_info, gemm_info.reinterpret_input_as_3d())) { - std::cerr << "Invalid arguments for CLGEMMReshapeLHSMatrixKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmReshapeLHSMatrixKernel." << std::endl; return false; } - if(!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) + if (!gemm.validate(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info)) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedKernel." << std::endl; return false; } // Configure functions - reshape_lhs.configure(&lhs, &lhs_reshaped, lhs_info); + reshape_lhs.configure(lhs.info(), lhs_reshaped.info(), lhs_info); - gemm.configure(&lhs_reshaped, &rhs_reshaped, &dst, lhs_info, rhs_info, gemm_info); + gemm.configure(lhs_reshaped.info(), rhs_reshaped.info(), dst.info(), lhs_info, rhs_info, gemm_info); // Allocate tensors lhs.allocator()->allocate(); @@ -294,8 +299,11 @@ public: } void do_run() override { - reshape_lhs.run(); - gemm.run(); + ITensorPack reshape_lsh_pack({{ACL_SRC, &lhs}, {ACL_DST, &lhs_reshaped}}); + reshape_lhs.run(reshape_lsh_pack); + + ITensorPack gemm_pack({{ACL_SRC_0, &lhs_reshaped}, {ACL_SRC_1, &rhs_reshaped}, {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -312,8 +320,8 @@ private: CLTensor rhs_reshaped{}; CLTensor dst{}; CLTuner tuner{}; - CLGEMMReshapeLHSMatrix reshape_lhs{}; - CLGEMMLowpMatrixMultiplyReshaped gemm{}; + ClGemmReshapeLHSMatrix reshape_lhs{}; + ClGemmLowpMatrixMultiplyReshaped gemm{}; }; /** Main test program for gemmlowp reshaped diff --git a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp index 95431ed50c..4acb316a3c 100644 --- a/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp +++ b/examples/gemm_tuner/cl_gemmlowp_reshaped_rhs_only_fused_output_stage_fixedpoint.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020-2021 Arm Limited. + * Copyright (c) 2020-2021, 2023 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -25,28 +25,29 @@ #error "This example needs to be built with -DARM_COMPUTE_CL" #endif /* ARM_COMPUTE_CL */ -#include "CommonGemmExampleOptions.h" -#include "GemmTunerHelpers.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/KernelDescriptors.h" -#include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/CLTuner.h" -#include "src/core/CL/kernels/CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel.h" -#include "src/core/CL/kernels/CLGEMMLowpReductionKernel.h" + +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" #include "tests/CL/Helper.h" -#include "utils/Utils.h" #include "utils/command_line/CommandLineOptions.h" #include "utils/command_line/CommandLineParser.h" +#include "utils/Utils.h" +#include "CommonGemmExampleOptions.h" +#include "GemmTunerHelpers.h" #include <cstdlib> #include <memory> using namespace arm_compute; using namespace utils; +using namespace arm_compute::opencl::kernels; using namespace arm_compute::misc::shape_calculator; using namespace gemm_tuner; @@ -55,12 +56,12 @@ namespace /** Structure holding all tunable gemm configs specific to this example/strategy */ struct GemmConfigs { - size_t m0{ 4 }; /**< Number of rows processed by the matrix multiplication */ - size_t n0{ 4 }; /**< Number of columns processed by the matrix multiplication */ - size_t k0{ 4 }; /**< Number of partial accumulations performed by the matrix multiplication */ - size_t h0{ 1 }; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ - bool interleave_rhs{ true }; /**< Interleave rhs matrix */ - bool transpose_rhs{ true }; /**< Transpose rhs matrix */ + size_t m0{4}; /**< Number of rows processed by the matrix multiplication */ + size_t n0{4}; /**< Number of columns processed by the matrix multiplication */ + size_t k0{4}; /**< Number of partial accumulations performed by the matrix multiplication */ + size_t h0{1}; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row */ + bool interleave_rhs{true}; /**< Interleave rhs matrix */ + bool transpose_rhs{true}; /**< Transpose rhs matrix */ }; /** Formatted output of the GemmConfigs type @@ -118,10 +119,10 @@ public: /** Default destructor */ ~GemmConfigOptions() = default; - SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ - SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ - SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ - SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ + SimpleOption<size_t> *m0; /**< Number of rows processed by the matrix multiplication option */ + SimpleOption<size_t> *n0; /**< Number of columns processed by the matrix multiplication option */ + SimpleOption<size_t> *k0; /**< Number of partial accumulations performed by the matrix multiplication option */ + SimpleOption<size_t> *h0; /**< Number of horizontal blocks of size (k0xn0) stored on the same output row option */ SimpleOption<size_t> *interleave_rhs; /**< Interleave rhs matrix option (1 enable; 0 disable) */ SimpleOption<size_t> *transpose_rhs; /**< Transpose rhs matrix option (1 enable; 0 disable) */ }; @@ -146,8 +147,9 @@ GemmConfigs consume_gemm_configs(const GemmConfigOptions &options) } // namespace -using CLGEMMLowpMatrixMultiplyReshapedOnlyRHS = test::CLSynthetizeFunction<CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel>; -using CLGEMMLowpMatrixAReduction = test::CLSynthetizeFunction<CLGEMMLowpMatrixAReductionKernel>; +using ClGemmLowpMatrixMultiplyReshapedOnlyRhs = + test::CLSynthetizeOperator<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>; +using ClGemmLowpMatrixAReduction = test::CLSynthetizeOperator<ClGemmLowpMatrixAReductionKernel>; class CLGEMMLowpMatrixMultiplyReshapedOnlyRHSFusedOutputStageFixedpointExample : public Example { @@ -164,12 +166,12 @@ public: GemmConfigOptions config_options(parser); parser.parse(argc, argv); - if(param_options.help->is_set() && param_options.help->value()) + if (param_options.help->is_set() && param_options.help->value()) { parser.print_help(argv[0]); return false; } - if(!parser.validate()) + if (!parser.validate()) { // Invalid arguments. Use default parameters and configs std::cerr << "Invalid arguments." << std::endl; @@ -193,15 +195,12 @@ public: lhs.allocator()->init(TensorInfo(TensorShape(params.K, params.M, params.B), 1, params.data_type)); rhs.allocator()->init(TensorInfo(TensorShape(params.N, params.K, params.B), 1, params.data_type)); - bias.allocator()->init(TensorInfo(TensorShape(params.N, 1, params.B), 1, DataType::S32)); + bias.allocator()->init(TensorInfo(TensorShape(params.N), 1, DataType::S32)); dst.allocator()->init(TensorInfo(TensorShape(params.N, params.M, params.B), 1, params.data_type)); // Set arbitrary quantization information (non-zero offset to ensure offset contribution stage is included) // Could be extended in the future to include a user-controlled option for offset == 0 - const QuantizationInfo q_info - { - 0.012, 3 - }; + const QuantizationInfo q_info{0.012, 3}; lhs.info()->set_quantization_info(q_info); rhs.info()->set_quantization_info(q_info); bias.info()->set_quantization_info(q_info); @@ -219,11 +218,17 @@ public: rhs_info.transpose = configs.transpose_rhs; rhs_info.export_to_cl_image = false; // CL image not supported for quantized cases yet - rhs_reshaped.allocator()->init(TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); + if (rhs_info.h0 == 0) + { + rhs_info.h0 = std::max(static_cast<unsigned int>(params.N) / rhs_info.n0, 1U); + } + + rhs_reshaped.allocator()->init( + TensorInfo(compute_rhs_reshaped_shape(*rhs.info(), rhs_info), 1, params.data_type)); rhs_reshaped.info()->set_quantization_info(q_info); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - if(!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) + if (!examples::gemm_tuner_helpers::update_padding_for_cl_image(rhs_reshaped.info())) { std::cerr << "cl_image is not supported on the device, disable export_to_cl_image" << std::endl; return false; @@ -236,7 +241,6 @@ public: gemmlowp_output_stage.output_data_type = dst.info()->data_type(); gemmlowp_output_stage.gemmlowp_offset = 0; { - const int idx_kernels = get_data_layout_dimension_index(lhs.info()->data_layout(), DataLayoutDimension::BATCHES); gemmlowp_output_stage.is_quantized_per_channel = false; // Num_filters is 1 unless quantized type is of per_channel type. Could be extended in the future to support per-channel quantization. const unsigned int num_filters = 1; @@ -246,10 +250,7 @@ public: gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); - quantization::compute_quantized_multipliers_and_shifts(lhs.info(), - rhs.info(), - dst.info(), - idx_kernels, + quantization::compute_quantized_multipliers_and_shifts(lhs.info(), rhs.info(), dst.info(), gemmlowp_output_stage.gemmlowp_multipliers.data(), gemmlowp_output_stage.gemmlowp_shifts.data()); gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; @@ -286,23 +287,23 @@ public: gemm_info.output_stage = gemmlowp_output_stage; // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { const TensorInfo info_vector_sum_row(compute_reductionB_shape(*lhs.info()), 1, DataType::S32); vector_sum_row.allocator()->init(info_vector_sum_row); - mtx_a_reduction = std::make_unique<CLGEMMLowpMatrixAReduction>(); + mtx_a_reduction = std::make_unique<ClGemmLowpMatrixAReduction>(); - if(!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) + if (!mtx_a_reduction->validate(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{})) { std::cerr << "Invalid arguments for CLGEMMLowpMatrixAReductionKernel." << std::endl; return false; } - mtx_a_reduction->configure(&lhs, &vector_sum_row, GEMMLowpReductionKernelInfo{}); + mtx_a_reduction->configure(lhs.info(), vector_sum_row.info(), GEMMLowpReductionKernelInfo{}); } // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { const TensorInfo info_vector_sum_col(compute_reductionA_shape(*rhs.info()), 1, DataType::S32); vector_sum_col.allocator()->init(info_vector_sum_col); @@ -310,15 +311,20 @@ public: } // Validate argments - if(!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), - gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), dst_shifts.info())) + if (!gemm.validate(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), + gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), + dst_multipliers.info(), dst_shifts.info())) { - std::cerr << "Invalid arguments for CLGEMMLowpMatrixMultiplyReshapedOnlyRHSKernel." << std::endl; + std::cerr << "Invalid arguments for ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel." << std::endl; return false; } // Configure function - gemm.configure(&lhs, &rhs_reshaped, &dst, gemm_info, gemm_info.a_offset == 0 ? nullptr : &vector_sum_col, gemm_info.b_offset == 0 ? nullptr : &vector_sum_row, &bias, &dst_multipliers, &dst_shifts); + gemm.configure(lhs.info(), rhs_reshaped.info(), dst.info(), gemm_info, + gemm_info.a_offset == 0 ? nullptr : vector_sum_col.info(), + gemm_info.b_offset == 0 ? nullptr : vector_sum_row.info(), bias.info(), dst_multipliers.info(), + dst_shifts.info()); // Allocate tensors lhs.allocator()->allocate(); @@ -335,11 +341,21 @@ public: } void do_run() override { - if(mtx_a_reduction != nullptr) + if (mtx_a_reduction != nullptr) { - mtx_a_reduction->run(); + ITensorPack red_pack({{ACL_SRC, &lhs}, {ACL_DST, &dst}}); + mtx_a_reduction->run(red_pack); } - gemm.run(); + + ITensorPack gemm_pack({{ACL_SRC_0, &lhs}, + {ACL_SRC_1, &rhs}, + {ACL_BIAS, &bias}, + {ACL_VEC_COL_SUM, &vector_sum_col}, + {ACL_VEC_ROW_SUM, &vector_sum_row}, + {ACL_SHIFTS, &dst_shifts}, + {ACL_MULTIPLIERS, &dst_multipliers}, + {ACL_DST, &dst}}); + gemm.run(gemm_pack); // Make sure all the OpenCL jobs are done executing: CLScheduler::get().sync(); @@ -360,8 +376,8 @@ private: CLTensor dst_multipliers{}; CLTensor dst_shifts{}; CLTuner tuner{}; - CLGEMMLowpMatrixMultiplyReshapedOnlyRHS gemm{}; - std::unique_ptr<CLGEMMLowpMatrixAReduction> mtx_a_reduction{ nullptr }; + ClGemmLowpMatrixMultiplyReshapedOnlyRhs gemm{}; + std::unique_ptr<ClGemmLowpMatrixAReduction> mtx_a_reduction{nullptr}; }; /** Main test program for gemmlowp reshaped rhs only with fused output stage fixedpoint |