From b9d38ee6378f3035f8dbad442223d3d9e2f3dc4f Mon Sep 17 00:00:00 2001
From: Frank Lei
Date: Tue, 5 Dec 2017 10:43:33 +0800
Subject: APPBROWSER-312 Fully connected performance optimization

Change-Id: Ie93fd630ebbad7b6ca8812cb5044b3f1908b45fd
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/111830
Reviewed-by: Stephen Li
Tested-by: BSG Visual Compute Jenkins server to access repositories on http://mpd-gerrit.cambridge.arm.com
Reviewed-by: Anthony Barbier
---
 .../kernels/GCDirectConvolutionLayerKernel.cpp     | 19 ++++++++-
 .../kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp | 46 +++++++++++++++++-----
 .../kernels/GCGEMMMatrixMultiplyKernel.cpp         | 30 +++++++++++++-
 src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp   | 33 +++++++++++++++-
 .../GLES_COMPUTE/kernels/GCTransposeKernel.cpp     | 32 ++++++++++++---
 5 files changed, 142 insertions(+), 18 deletions(-)

(limited to 'src/core/GLES_COMPUTE/kernels')

diff --git a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
index b032bc5668..a7d721d035 100644
--- a/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCDirectConvolutionLayerKernel.cpp
@@ -53,7 +53,6 @@ template <unsigned int kernel_size>
 void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *input, const IGCTensor *weights, const IGCTensor *bias, IGCTensor *output, const PadStrideInfo &conv_info)
 {
     ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32);
-    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(2) != input->info()->dimension(2));
     ARM_COMPUTE_ERROR_ON(weights->info()->dimension(0) != weights->info()->dimension(1));
     ARM_COMPUTE_ERROR_ON(weights->info()->num_dimensions() > 4);
@@ -68,6 +67,24 @@ void GCDirectConvolutionLayerKernel<kernel_size>::configure(const IGCTensor *inp
         ARM_COMPUTE_ERROR_ON(bias->info()->num_dimensions() > 1);
     }
 
+    // Get convolved dimensions
+    unsigned int owidth  = 0;
+    unsigned int oheight = 0;
+    std::tie(owidth, oheight) = scaled_dimensions(input->info()->dimension(0), input->info()->dimension(1), kernel_size, kernel_size, conv_info);
+
+    TensorShape output_shape = input->info()->tensor_shape();
+    output_shape.set(0, owidth);
+    output_shape.set(1, oheight);
+    output_shape.set(2, weights->info()->dimension(3));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT(input, output);
+
     _conv_stride_x = std::get<0>(conv_info.stride());
     _conv_stride_y = std::get<1>(conv_info.stride());
     _conv_pad_x    = std::get<0>(conv_info.pad());
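Note: the hunk above computes the convolved output shape up front and auto-initializes the output tensor, so the type/shape assertions can run against a fully configured output. A minimal sketch of the underlying output-size arithmetic (standalone C++; scaled_dims is a hypothetical stand-in for arm_compute::scaled_dimensions, assuming a square kernel, symmetric padding and floor rounding):

    #include <cstdio>
    #include <utility>

    // Hypothetical stand-in for arm_compute::scaled_dimensions():
    // output extent of a convolution with floor rounding.
    static std::pair<unsigned int, unsigned int> scaled_dims(unsigned int w, unsigned int h,
                                                             unsigned int kernel,
                                                             unsigned int stride, unsigned int pad)
    {
        const unsigned int ow = (w + 2 * pad - kernel) / stride + 1;
        const unsigned int oh = (h + 2 * pad - kernel) / stride + 1;
        return std::make_pair(ow, oh);
    }

    int main()
    {
        // e.g. a 224x224 input with a 3x3 kernel, stride 1, pad 1 keeps its spatial size
        const auto out = scaled_dims(224U, 224U, 3U, 1U, 1U);
        std::printf("%ux%u\n", out.first, out.second); // prints 224x224
        return 0;
    }

The third output dimension is then taken from weights dimension 3, i.e. one output plane per kernel, matching the output_shape.set(2, ...) call in the hunk.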
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
index 8625d371e5..944585daff 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixAccumulateBiasesKernel.cpp
@@ -37,7 +37,7 @@ using namespace arm_compute;
 
 GCGEMMMatrixAccumulateBiasesKernel::GCGEMMMatrixAccumulateBiasesKernel()
-    : _accum(nullptr), _biases(nullptr)
+    : _accum(nullptr), _biases(nullptr), _lws(gles::NDRange(1U, 1U, 1U))
 {
 }
@@ -51,14 +51,23 @@ void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTe
     _accum = accum;
 
     std::set<std::string> build_opts;
-    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
-    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
+    build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(_lws[0]));
+    build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(_lws[1]));
+    build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(_lws[2]));
 
     // Create kernel
     build_opts.emplace("#define GEMM_ACCUMULATE_BIASES");
+
+#define ACCUM_PROCESS_4X
+
+#if defined(ACCUM_PROCESS_4X)
+    build_opts.emplace("#define ACCUM_PROCESS_4X");
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+    build_opts.emplace("#define ACCUM_PROCESS_8X");
+#endif /* ACCUM_PROCESS_4X */
     std::string dt_name = (accum->info()->data_type() == DataType::F32) ? "DATA_TYPE_FP32" : "DATA_TYPE_FP16";
     build_opts.emplace(("#define " + dt_name));
+
     _kernel = GCKernelLibrary::get().create_kernel("gemm_accumulate_biases", build_opts);
 
     // Configure kernel window
@@ -70,13 +79,21 @@ void GCGEMMMatrixAccumulateBiasesKernel::configure(IGCTensor *accum, const IGCTe
     }
     else if(_accum->info()->data_type() == DataType::F16)
     {
+#if defined(ACCUM_PROCESS_4X)
         num_elems_processed_per_iteration = 4;
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+        num_elems_processed_per_iteration = 8;
+#endif /* ACCUM_PROCESS_4X */
     }
 
-    Window win = calculate_max_window(*_accum->info(), Steps(num_elems_processed_per_iteration));
+    const int  accum_width         = accum->info()->dimension(0);
+    const int  accum_padding_right = ceil_to_multiple(accum_width, num_elems_processed_per_iteration * _lws[0]) - accum_width;
+    BorderSize border              = BorderSize(0, accum_padding_right, 0, 0);
+
+    Window win = calculate_max_enlarged_window(*_accum->info(), Steps(num_elems_processed_per_iteration), border);
 
-    AccessWindowStatic     biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration), biases->info()->dimension(1));
-    AccessWindowHorizontal accum_access(_accum->info(), 0, num_elems_processed_per_iteration);
+    AccessWindowStatic biases_access(biases->info(), 0, 0, ceil_to_multiple(biases->info()->dimension(0), num_elems_processed_per_iteration * _lws[0]), biases->info()->dimension(1));
+    AccessWindowStatic accum_access(_accum->info(), 0, 0, accum_width + accum_padding_right, _accum->info()->dimension(1));
 
     update_window_and_padding(win, biases_access, accum_access);
 
@@ -107,13 +124,22 @@ void GCGEMMMatrixAccumulateBiasesKernel::run(const Window &window)
         }
         else if(_accum->info()->data_type() == DataType::F16)
         {
-            add_2D_tensor_argument(idx, _accum, BufferParam(1, 3), accum_slice);
-            add_1D_tensor_argument(idx, _biases, BufferParam(2, 3), biases_slice);
+#if defined(ACCUM_PROCESS_4X)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#elif defined(ACCUM_PROCESS_8X) /* ACCUM_PROCESS_4X */
+            BufferParam param = { 1, 4 };
+            add_2D_tensor_argument(idx, _accum, param, accum_slice);
+            param.binding_point = 2;
+            add_1D_tensor_argument(idx, _biases, param, biases_slice);
+#endif /* ACCUM_PROCESS_4X */
        }
 
         _kernel.update_shader_params();
-        enqueue(*this, accum_slice);
+        enqueue(*this, accum_slice, _lws);
     }
     while(window.slide_window_slice_2D(accum_slice));
 }
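Note: the reworked window logic above right-pads the accumulator so each shader invocation processes a whole vec4 (ACCUM_PROCESS_4X) or vec8 (ACCUM_PROCESS_8X) of values, and enqueue() now receives the kernel's local workgroup size _lws. A small sketch of the padding arithmetic, with ceil_to_multiple re-implemented here on the assumption that it rounds up to the next multiple:

    #include <cstdio>

    // Assumed behaviour of arm_compute::ceil_to_multiple():
    // smallest multiple of m that is >= v.
    static unsigned int ceil_to_multiple(unsigned int v, unsigned int m)
    {
        return ((v + m - 1) / m) * m;
    }

    int main()
    {
        const unsigned int accum_width = 1001U; // hypothetical FC output width
        const unsigned int vec_width   = 4U;    // ACCUM_PROCESS_4X
        const unsigned int lws_x       = 1U;    // _lws[0]

        // Right padding so the enlarged window covers whole vec4 loads per invocation
        const unsigned int padding = ceil_to_multiple(accum_width, vec_width * lws_x) - accum_width;
        std::printf("padding right = %u\n", padding); // 1004 - 1001 = 3
        return 0;
    }

Declaring the padded region through the static access windows keeps the vectorized loads and stores in bounds without a scalar tail loop.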
diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
index a75ab6b609..8179525470 100644
--- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp
@@ -118,9 +118,23 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
     switch(input0->info()->data_type())
     {
         case DataType::F16:
+            build_opts.emplace("#define DATA_TYPE_FP16");
+
+#define MM_PROCESS_4X_OPTIMIZED
+
+#if defined(MM_PROCESS_4X)
+            num_elems_processed_per_iteration_x = 4;
+            num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+            build_opts.emplace("#define MM_PROCESS_4X");
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
             num_elems_processed_per_iteration_x = 4;
+            num_elems_processed_per_iteration_y = std::min(static_cast<int>(output->info()->dimension(1)), 4);
+            build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED");
+#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */
+            num_elems_processed_per_iteration_x = 8;
             num_elems_processed_per_iteration_y = 1;
-            build_opts.emplace("#define DATA_TYPE_FP16");
+            build_opts.emplace("#define MM_PROCESS_8X");
+#endif /* MM_PROCESS_4X */
             break;
 
         case DataType::F32:
@@ -143,8 +157,12 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen
 
     win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
+#if defined(MM_PROCESS_4X_OPTIMIZED)
+    AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#else /* MM_PROCESS_4X_OPTIMIZED */
     AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y));
+#endif /* MM_PROCESS_4X_OPTIMIZED */
     AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1));
     AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y);
@@ -185,9 +203,19 @@ void GCGEMMMatrixMultiplyKernel::run(const Window &window)
         switch(_input0->info()->data_type())
         {
             case DataType::F16:
+#if defined(MM_PROCESS_4X)
                 add_2D_tensor_argument(idx, _input0, BufferParam(1, 2), slice);
                 add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
                 add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 3), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 3), slice);
+#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */
+                add_2D_tensor_argument(idx, _input0, BufferParam(1, 4), slice);
+                add_2D_tensor_argument(idx, _input1, BufferParam(2, 4), slice_b);
+                add_2D_tensor_argument(idx, _output, BufferParam(3, 4), slice);
+#endif /* MM_PROCESS_4X */
                 break;
 
             case DataType::F32:
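Note: in the MM_PROCESS_4X_OPTIMIZED path each invocation computes a block of 4 output columns by up to 4 rows, but the y-step is clamped with std::min because in a fully connected layer the GEMM output often has height 1 (a single input vector times the weight matrix). A toy illustration of that step selection, using hypothetical output heights:

    #include <algorithm>
    #include <cstdio>
    #include <initializer_list>

    int main()
    {
        // Hypothetical output heights: a batched GEMM vs. a fully connected vector
        for(const int output_height : { 16, 1 })
        {
            const int step_x = 4;                          // vec4 of columns per invocation
            const int step_y = std::min(output_height, 4); // never step past the tensor
            std::printf("height %2d -> window steps %dx%d\n", output_height, step_x, step_y);
        }
        return 0;
    }

The input0 access window is also widened to a multiple of 8 in the optimized path, presumably so the shader can fetch the LHS rows in packed 8-element chunks.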
diff --git a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
index 97c4dc48a1..e849891c7c 100644
--- a/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCIm2ColKernel.cpp
@@ -107,7 +107,38 @@ void GCIm2ColKernel::configure(const IGCTensor *input, IGCTensor *output, std::p
     else
     {
         build_opts.insert("#define IM2COL_REDUCED");
-        _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+
+        if(input->info()->data_type() == DataType::F32)
+        {
+            _num_elems_processed_per_iteration = 4 / input->info()->element_size();
+        }
+        else if(input->info()->data_type() == DataType::F16)
+        {
+            int input_width  = input->info()->dimension(0);
+            int input_height = input->info()->dimension(1);
+
+            build_opts.insert("#define IMAGE_SIZE " + support::cpp11::to_string(input_width * input_height));
+            if(input_width % 8 == 0)
+            {
+                _num_elems_processed_per_iteration = 8;
+                build_opts.insert("#define IM2COL_REDUCED_8X");
+            }
+            else if(input_width % 4 == 0)
+            {
+                _num_elems_processed_per_iteration = 4;
+                build_opts.insert("#define IM2COL_REDUCED_4X");
+            }
+            else if(input_width % 2 == 0)
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_2X");
+            }
+            else
+            {
+                _num_elems_processed_per_iteration = 2;
+                build_opts.insert("#define IM2COL_REDUCED_GENERIC");
+            }
+        }
 
         // Create kernel
         _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("im2col_reduced", build_opts));
diff --git a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
index 5bd34c2c85..acb998840b 100644
--- a/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
+++ b/src/core/GLES_COMPUTE/kernels/GCTransposeKernel.cpp
@@ -64,12 +64,25 @@ void GCTransposeKernel::configure(const IGCTensor *input, IGCTensor *output)
     build_opts.emplace("#define LOCAL_SIZE_Y " + support::cpp11::to_string(1));
     build_opts.emplace("#define LOCAL_SIZE_Z " + support::cpp11::to_string(1));
 
+    // Configure kernel window
+    unsigned int num_elems_processed_per_iteration = 4;
+
+    if(input->info()->data_type() == DataType::F16)
+    {
+#define TRANSPOSE_8X8
+
+#if defined(TRANSPOSE_4X4)
+        build_opts.emplace(("#define TRANSPOSE_4X4"));
+        num_elems_processed_per_iteration = 4;
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+        build_opts.emplace(("#define TRANSPOSE_8X8"));
+        num_elems_processed_per_iteration = 8;
+#endif /* TRANSPOSE_4X4 */
+    }
+
     // Create kernel
     _kernel = static_cast<GCKernel>(GCKernelLibrary::get().create_kernel("transpose", build_opts));
 
-    // Configure kernel window
-    const unsigned int num_elems_processed_per_iteration = 4;
-
     Window win = calculate_max_window(*input->info(), Steps(num_elems_processed_per_iteration, num_elems_processed_per_iteration));
 
     AccessWindowRectangle input_access(input->info(), 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration);
@@ -100,8 +113,17 @@ void GCTransposeKernel::run(const Window &window)
         }
         else if(_input->info()->data_type() == DataType::F16)
         {
-            add_2D_tensor_argument(idx, _input, BufferParam(1, 3), slice);
-            add_2D_tensor_argument(idx, _output, BufferParam(2, 3), slice);
+#if defined(TRANSPOSE_4X4)
+            BufferParam param = { 1, 3 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#elif defined(TRANSPOSE_8X8) /* TRANSPOSE_4X4 */
+            BufferParam param = { 1, 4 };
+            add_2D_tensor_argument(idx, _input, param, slice);
+            param.binding_point = 2;
+            add_2D_tensor_argument(idx, _output, param, slice);
+#endif /* TRANSPOSE_4X4 */
         }
 
         _kernel.update_shader_params();
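Note: both remaining kernels follow the same pattern of picking the widest vector access the tensor allows and falling back otherwise. For im2col the selection is driven by the input row width; a compact standalone restatement of that logic (the function name is illustrative, not part of the library):

    #include <cstdio>
    #include <initializer_list>

    // Mirrors the F16 vector-width selection in GCIm2ColKernel::configure():
    // prefer 8-wide loads, fall back to 4-wide, 2-wide, then a generic 2-wide path.
    static const char *select_im2col_variant(int input_width, int &elems_per_iter)
    {
        if(input_width % 8 == 0) { elems_per_iter = 8; return "IM2COL_REDUCED_8X"; }
        if(input_width % 4 == 0) { elems_per_iter = 4; return "IM2COL_REDUCED_4X"; }
        if(input_width % 2 == 0) { elems_per_iter = 2; return "IM2COL_REDUCED_2X"; }
        elems_per_iter = 2;       return "IM2COL_REDUCED_GENERIC";
    }

    int main()
    {
        for(const int w : { 224, 100, 54, 7 })
        {
            int n = 0;
            std::printf("width %3d -> %s (%d elems/iter)\n", w, select_im2col_variant(w, n), n);
        }
        return 0;
    }

The transpose kernel makes the analogous trade at compile time: the TRANSPOSE_8X8 variant moves 8x8 F16 tiles per iteration instead of the 4x4 tiles of the TRANSPOSE_4X4 variant.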