diff options
author | Anton Lokhmotov <psyhtest@users.noreply.github.com> | 2017-11-20 11:02:10 +0000 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:35:24 +0000 |
commit | 3e80c7fa601d5996e8ada3b2f6c69327f066ec17 (patch) | |
tree | e1d4f1c8c1dafe46005feb4e716ed80b6bbe9489 /src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | |
parent | d7295b7079f6b9126596cea998146ca9c6e87706 (diff) | |
download | ComputeLibrary-3e80c7fa601d5996e8ada3b2f6c69327f066ec17.tar.gz |
COMPMID-661: Optimize FC layer with 2 new Bifrost kernels and LWS tuning (#33)
Change-Id: Ie56ac88dff5ff339572cec562e8cd62dc7f0aa8b
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/109805
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com <bsgcomp@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp')
-rw-r--r-- | src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 127 |
1 files changed, 71 insertions, 56 deletions
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index b184c507ff..d39dcdb336 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -38,7 +38,6 @@ #include "arm_compute/core/Window.h" #include <set> -#include <sstream> #include <string> using namespace arm_compute; @@ -53,7 +52,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - if(!is_interleaved_transposed) { ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); @@ -63,49 +61,44 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _input1 = input1; _output = output; - if(output->info()->dimension(1) == 196) - { - _lws_hint = cl::NDRange(1, 7); - } - else - { - _lws_hint = cl::NDRange(8, 8); - } + const DataType data_type = input0->info()->data_type(); + const int fp_pos = input0->info()->fixed_point_position(); - std::set<std::string> build_opts; - build_opts.emplace(("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)))); - build_opts.emplace(("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)))); + // Get target architecture + GPUTarget arch_target = get_arch_from_target(get_target()); - if(is_data_type_fixed_point(input0->info()->data_type())) - { - build_opts.emplace(("-DALPHA=" + support::cpp11::to_string((input0->info()->data_type() == DataType::QS8 ? - sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) : - sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position()))))); + // Configure LWS hint + _lws_hint = (output->info()->dimension(1) == 196) ? cl::NDRange(1, 7) : cl::NDRange(8, 8); - build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input0->info()->fixed_point_position()))); - } - else + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(is_data_type_fixed_point(data_type), "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(fp_pos)); + + const bool multiply_alpha = std::abs(1.0f - alpha) > 0.00001f; + + // Only define ALPHA when alpha is not 1.0f. This avoids performing unnecessary multiplications. + if(multiply_alpha) { - build_opts.emplace(("-DALPHA=" + float_to_string_with_full_precision(alpha))); + build_opts.add_option_if_else(is_data_type_fixed_point(data_type), + "-DALPHA=" + support::cpp11::to_string((data_type == DataType::QS8 ? sqcvt_qs8_f32(alpha, fp_pos) : sqcvt_qs16_f32(alpha, fp_pos))), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); } + std::string kernel_name; if(is_interleaved_transposed) { - // Create kernel - std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); - - if(data_type_name == "f32") + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0))); + if(data_type == DataType::F32) { - GPUTarget arch_target = get_arch_from_target(get_target()); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target), build_opts)); + kernel_name = "gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target); } else { - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_" + data_type_name, build_opts)); + kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)); } - // Configure window kernel - const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + // Configure kernel window + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); constexpr unsigned int num_elems_processed_per_iteration_y = 4; Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); @@ -122,28 +115,47 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen } else // The input tensors have not been reshaped { - ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); - // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor - const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor. num_elems_processed_per_iteration_x is set up for the default case. + unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type); const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4); - build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()))); - build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x))); - build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y))); - - // Create kernel - if(is_data_type_fixed_point(input0->info()->data_type())) + // Create kernels according to the architecture, data type and input size. + if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32) { - std::string kernel_name = "gemm_mm_" + lower_string(string_from_data_type(input0->info()->data_type())); - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts)); + // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and + // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g. + // FC6 and FC7 of AlexNet and VGG-16). + if(input1->info()->dimension(0) <= 1000) + { + // Each work-item processes 2 elements in the X dimension. + num_elems_processed_per_iteration_x = 2; + kernel_name = "gemm_mm_floating_point_f32_bifrost_1000"; + } + else + { + // Each work-item processes 4 elements in the X dimension (as in the default case). + num_elems_processed_per_iteration_x = 4; + kernel_name = "gemm_mm_floating_point_f32_bifrost"; + } + // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels + // via exhaustive autotuning over a range of representative layer configurations. + _lws_hint = cl::NDRange(4); } - else + else if(is_data_type_fixed_point(data_type)) { - std::string kernel_name = "gemm_mm_floating_point"; - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel((kernel_name), build_opts)); + kernel_name = "gemm_mm_" + lower_string(string_from_data_type(data_type)); } + else // (MIDGARD and F32) or (F16) + { + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + kernel_name = "gemm_mm_floating_point"; + } + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y)); + build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x)); + // Configure window Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); @@ -157,18 +169,21 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); ICLKernel::configure(win); - - // Set config_id for enabling LWS tuning - _config_id = "gemm_"; - _config_id += (is_interleaved_transposed ? "reshaped_" : ""); - _config_id += lower_string(string_from_data_type(input0->info()->data_type())); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(1)); - _config_id += "_"; - _config_id += support::cpp11::to_string(output->info()->dimension(0)); - _config_id += "_"; - _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } + + // Create kernel + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); + + // Set config_id for enabling LWS tuning + _config_id = "gemm_"; + _config_id += (is_interleaved_transposed ? "reshaped_" : ""); + _config_id += lower_string(string_from_data_type(input0->info()->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(output->info()->dimension(0)); + _config_id += "_"; + _config_id += (is_interleaved_transposed ? support::cpp11::to_string(input1->info()->dimension(0)) : support::cpp11::to_string(input1->info()->dimension(1))); } void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &queue) |