From edfa9f463bed084f8b0953557202b2a1e56da817 Mon Sep 17 00:00:00 2001 From: Gian Marco Iodice Date: Tue, 15 Aug 2017 11:45:22 +0100 Subject: COMPMID-477 - Optimized batched case in CLConvolutionLayer Change-Id: I4ef18f49f1da0cb816aaa0762466b940792c15ed Reviewed-on: http://mpd-gerrit.cambridge.arm.com/84162 Tested-by: Kaizen Reviewed-by: Anthony Barbier --- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 96 ++++++++++++---------- 1 file changed, 53 insertions(+), 43 deletions(-) (limited to 'src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp') diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 39526a23e1..684e3232d5 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -48,13 +48,13 @@ CLGEMMMatrixMultiplyKernel::CLGEMMMatrixMultiplyKernel() { } -void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha) +void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTensor *input1, ICLTensor *output, float alpha, bool is_interleaved_transposed) { ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input0, input1, output); - if(output->info()->dimension(1) == 1) + if(!is_interleaved_transposed) { ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); } @@ -72,79 +72,89 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen _lws_hint = cl::NDRange(8, 8); } - std::ostringstream mm_arguments; - mm_arguments << "-DWIDTH_MATRIX_B=" << input1->info()->dimension(0) << " "; + std::set build_opts; + build_opts.emplace(("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0)))); + build_opts.emplace(("-DCOLS_B=" + support::cpp11::to_string(input1->info()->dimension(0)))); + if(is_data_type_fixed_point(input0->info()->data_type())) { - mm_arguments << "-DALPHA=" << (input0->info()->data_type() == DataType::QS8 ? - sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) : - sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position())) - << " "; - mm_arguments << "-DFIXED_POINT_POSITION=" << input0->info()->fixed_point_position() << " "; + build_opts.emplace(("-DALPHA=" + support::cpp11::to_string((input0->info()->data_type() == DataType::QS8 ? + sqcvt_qs8_f32(alpha, input0->info()->fixed_point_position()) : + sqcvt_qs16_f32(alpha, input0->info()->fixed_point_position()))))); + + build_opts.emplace(("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input0->info()->fixed_point_position()))); } else { - mm_arguments << "-DALPHA=" << alpha << " "; + build_opts.emplace(("-DALPHA=" + float_to_string_with_full_precision(alpha))); } - std::set build_opts; - // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication - if(output->info()->dimension(1) == 1) + if(is_interleaved_transposed) { - mm_arguments << "-DWIDTH_VECTOR_A=" << input0->info()->dimension(0) << " "; - build_opts.emplace(mm_arguments.str()); - // Create kernel std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); - _kernel = static_cast(CLKernelLibrary::get().create_kernel(("gemm_vm_" + data_type_name), build_opts)); + + if(data_type_name == "f32") + { + GPUTarget arch_target = get_arch_from_target(get_target()); + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_f32_" + string_from_target(arch_target), build_opts)); + } + else + { + _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_mm_interleaved_transposed_" + data_type_name, build_opts)); + } // Configure window kernel - const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + constexpr unsigned int num_elems_processed_per_iteration_y = 4; - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x)); + Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->tensor_shape().x(), 1); - AccessWindowHorizontal input1_access(input1->info(), 0, num_elems_processed_per_iteration_x); - AccessWindowHorizontal output_access(output->info(), 0, num_elems_processed_per_iteration_x); + AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); update_window_and_padding(win, input0_access, input1_access, output_access); - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); ICLKernel::configure(win); } - else + else // The input tensors have not been reshaped { - build_opts.emplace(mm_arguments.str()); + ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); - // Create kernel - std::string data_type_name = lower_string(string_from_data_type(input0->info()->data_type())); + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor + const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); + const unsigned int num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); - if(data_type_name == "f32") + build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input0->info()->data_type()))); + build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_X=" + support::cpp11::to_string(num_elems_processed_per_iteration_x))); + build_opts.emplace(("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elems_processed_per_iteration_y))); + + // Create kernel + if(is_data_type_fixed_point(input0->info()->data_type())) { - GPUTarget arch_target = get_arch_from_target(get_target()); - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_mm_f32_" + string_from_target(arch_target), build_opts)); + std::string kernel_name = "gemm_mm_" + lower_string(string_from_data_type(input0->info()->data_type())); + _kernel = static_cast(CLKernelLibrary::get().create_kernel((kernel_name), build_opts)); } else { - _kernel = static_cast(CLKernelLibrary::get().create_kernel("gemm_mm_" + data_type_name, build_opts)); + std::string kernel_name = "gemm_mm_floating_point"; + _kernel = static_cast(CLKernelLibrary::get().create_kernel((kernel_name), build_opts)); } - // Configure window kernel - const unsigned int num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(input0->info()->data_type()); - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - Window win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); - AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowStatic input0_access(input0->info(), 0, 0, input0->info()->dimension(0), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); update_window_and_padding(win, input0_access, input1_access, output_access); - output_access.set_valid_region(win, ValidRegion(Coordinates(0, 0), output->info()->tensor_shape())); + Coordinates coord; + coord.set_num_dimensions(output->info()->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); ICLKernel::configure(win); } @@ -157,9 +167,9 @@ void CLGEMMMatrixMultiplyKernel::run(const Window &window, cl::CommandQueue &que Window slice = window.first_slice_window_2D(); Window slice_matrix_b = slice; - slice_matrix_b.set(Window::DimX, Window::Dimension(0, _input1->info()->dimension(0), 1)); - slice_matrix_b.set(Window::DimY, Window::Dimension(0, _input1->info()->dimension(1), 1)); - slice_matrix_b.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); do { -- cgit v1.2.1