From 164b65d3c8f61f1d6d404fb484c1998a20a2cbda Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Fri, 13 Apr 2018 14:28:08 +0100 Subject: COMPMID-1043: Rework GCGEMMMatrixMultiplyKernel interface and allow auto initialization of the tensors This patch also: - removes support for already reshaped weights in GCConvolutionLayer - makes GCConvolutionLayer similar to CLGEMMConvolutionLayer - enables usage of the GCGEMM function in GCConvolution instead of calling the GEMM kernels directly Change-Id: I3e4a64335555e86e18585d38d8fda4bfdb44e265 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/127696 Tested-by: Jenkins Reviewed-by: Anthony Barbier --- .../kernels/GCGEMMMatrixMultiplyKernel.cpp | 247 +++++++++++++++------ .../kernels/GCWeightsReshapeKernel.cpp | 11 +- 2 files changed, 183 insertions(+), 75 deletions(-) (limited to 'src/core/GLES_COMPUTE') diff --git a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp index a5f09e8eac..b4bb5470ad 100644 --- a/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCGEMMMatrixMultiplyKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2018 ARM Limited. + * Copyright (c) 2017-2018 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -31,37 +31,180 @@ #include "arm_compute/core/GLES_COMPUTE/IGCTensor.h" #include "arm_compute/core/GLES_COMPUTE/OpenGLES.h" #include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include #include using namespace arm_compute; using namespace arm_compute::gles_compute; +using namespace arm_compute::misc::shape_calculator; -GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel() - : _input0(nullptr), _input1(nullptr), _output(nullptr) +namespace { -} +using ElementsProcessed = Steps; -void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed) +inline Status validate_arguments(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1, output); + ARM_COMPUTE_UNUSED(reshape_info); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input0, input1, output); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input0, input1); if(!is_interleaved_transposed) { - ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); + ARM_COMPUTE_ERROR_ON(input0->dimension(0) != input1->dimension(1)); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(input1->dimension(0) != output->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(input0->dimension(1) != output->dimension(1)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + } + } + else + { + const int m = reshape_info.m(); + const int n = reshape_info.n(); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + TensorShape tensor_shape0{ input0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ input1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = input0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = input1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = input0->clone()->set_tensor_shape(compute_interleaved_shape(tensor_info0, mult_interleave4x4_height)); + const TensorInfo tensor_info_reshaped1 = input1->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(tensor_info1, mult_transpose1xW_width)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input1, &tensor_info_reshaped1); + + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(0) != static_cast(n)); + ARM_COMPUTE_RETURN_ERROR_ON(output->dimension(1) != static_cast(m)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input0, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_FIXED_POINT(input0, output); + } + } + + return Status{}; +} + +inline std::pair validate_and_configure_window(ITensorInfo *input0, ITensorInfo *input1, ITensorInfo *output, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, + GPUTarget gpu_target, ElementsProcessed &num_elements_processed) +{ + ARM_COMPUTE_UNUSED(gpu_target); + + // Output tensor auto inizialitation if not yet initialized + TensorShape tensor_shape{ input0->tensor_shape() }; + tensor_shape.set(0, is_interleaved_transposed ? reshape_info.n() : input1->dimension(0)); + tensor_shape.set(1, is_interleaved_transposed ? reshape_info.m() : input0->dimension(1)); + + auto_init_if_empty(*output, input0->clone()->set_tensor_shape(tensor_shape)); + + bool window_changed = false; + Window win{}; + + const DataType data_type = input0->data_type(); + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + + if(is_interleaved_transposed) + { + // Configure window kernel + num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type); + num_elems_processed_per_iteration_y = 4; + + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle input0_access(input0, 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); + AccessWindowTranspose input1_access(input1, 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + output_access.set_valid_region(win, ValidRegion(Coordinates(), output->tensor_shape())); + } + else // The input tensors have not been reshaped + { + // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor + + switch(data_type) + { + case DataType::F16: + num_elems_processed_per_iteration_x = 4; + num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); + break; + + case DataType::F32: + num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(data_type); + num_elems_processed_per_iteration_y = std::min(static_cast(output->dimension(1)), 4); + break; + + default: + ARM_COMPUTE_ERROR("Current data type is not supported"); + break; + } + + win = calculate_max_window(*output, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic input0_access(input0, 0, 0, ceil_to_multiple(input0->dimension(0), 8), ceil_to_multiple(input0->dimension(1), num_elems_processed_per_iteration_y)); + AccessWindowStatic input1_access(input1, 0, 0, ceil_to_multiple(input1->dimension(0), num_elems_processed_per_iteration_x), input1->dimension(1)); + AccessWindowRectangle output_access(output, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + update_window_and_padding(win, input0_access, input1_access, output_access); + + Coordinates coord; + coord.set_num_dimensions(output->num_dimensions()); + output_access.set_valid_region(win, ValidRegion(coord, output->tensor_shape())); } + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +GCGEMMMatrixMultiplyKernel::GCGEMMMatrixMultiplyKernel() + : _input0(nullptr), _input1(nullptr), _output(nullptr) +{ +} + +void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTensor *input1, IGCTensor *output, float alpha, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input0, input1, output); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info)); + _input0 = input0; _input1 = input1; _output = output; + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(input0->info(), input1->info(), output->info(), is_interleaved_transposed, reshape_info, GPUTarget::UNKNOWN, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IGCKernel::configure(win_config.second); + + // Create build options std::set build_opts; + std::string kernel_name; Window win; build_opts.emplace("#define LOCAL_SIZE_X " + support::cpp11::to_string(1)); @@ -74,6 +217,12 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen // Check if the output tensor is a vector. If so,the kernel runs the vector-matrix multiplication if(is_interleaved_transposed) { + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + build_opts.emplace("#define MULT_TRANSPOSE1XW_WIDTH " + support::cpp11::to_string(mult_transpose1xW_width)); + build_opts.emplace("#define MULT_INTERLEAVE4X4_HEIGHT " + support::cpp11::to_string(mult_interleave4x4_height)); + switch(input0->info()->data_type()) { case DataType::F16: @@ -91,56 +240,20 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen build_opts.emplace("#define GEMM_MM_INTERLEAVED_TRANSPOSED"); - // Create kernel - _kernel = GCKernelLibrary::get().create_kernel(("gemm_mm_interleaved_transposed"), build_opts); - - // Configure window kernel - const unsigned int num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type()); - constexpr unsigned int num_elems_processed_per_iteration_y = 4; - - win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - - AccessWindowRectangle input0_access(input0->info(), 0, 0, num_elems_processed_per_iteration_y, 1, 1.f, 0.25f); - AccessWindowTranspose input1_access(input1->info(), 0, 0, num_elems_processed_per_iteration_x, 1, 0.f, 0.25f); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - update_window_and_padding(win, input0_access, input1_access, output_access); - - output_access.set_valid_region(win, ValidRegion(Coordinates(), output->info()->tensor_shape())); + kernel_name = "gemm_mm_interleaved_transposed"; } else { - ARM_COMPUTE_ERROR_ON(input0->info()->dimension(0) != input1->info()->dimension(1)); - // Special case for 1xN, 2xN, 3xN and 4xN input0 tensor - unsigned int num_elems_processed_per_iteration_x; - unsigned int num_elems_processed_per_iteration_y; switch(input0->info()->data_type()) { case DataType::F16: build_opts.emplace("#define DATA_TYPE_FP16"); - -#define MM_PROCESS_4X_OPTIMIZED - -#if defined(MM_PROCESS_4X) - num_elems_processed_per_iteration_x = 4; - num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); - build_opts.emplace("#define MM_PROCESS_4X"); -#elif defined(MM_PROCESS_4X_OPTIMIZED) /* MM_PROCESS_4X */ - num_elems_processed_per_iteration_x = 4; - num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); build_opts.emplace("#define MM_PROCESS_4X_OPTIMIZED"); -#elif defined(MM_PROCESS_8X) /* MM_PROCESS_4X */ - num_elems_processed_per_iteration_x = 8; - num_elems_processed_per_iteration_y = 1; - build_opts.emplace("#define MM_PROCESS_8X"); -#endif /* MM_PROCESS_4X */ break; case DataType::F32: - num_elems_processed_per_iteration_x = max_gc_vector_width / data_size_from_type(input0->info()->data_type()); - num_elems_processed_per_iteration_y = std::min(static_cast(output->info()->dimension(1)), 4); build_opts.emplace("#define DATA_TYPE_FP32"); break; @@ -150,31 +263,31 @@ void GCGEMMMatrixMultiplyKernel::configure(const IGCTensor *input0, const IGCTen } build_opts.emplace("#define GEMM_MM_FLOATING_POINT"); - build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elems_processed_per_iteration_x)); - build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elems_processed_per_iteration_y)); - - // Create kernel - _kernel = GCKernelLibrary::get().create_kernel("gemm_mm_floating_point", build_opts); + build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_X " + support::cpp11::to_string(num_elements_processed.x())); + build_opts.emplace("#define NUM_ELEMS_PROCESSED_PER_THREAD_Y " + support::cpp11::to_string(num_elements_processed.y())); - win = calculate_max_window(*output->info(), Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - -#if defined(MM_PROCESS_4X_OPTIMIZED) - AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), 8), ceil_to_multiple(input0->info()->dimension(1), num_elems_processed_per_iteration_y)); -#else /* MM_PROCESS_4X_OPTIMIZED */ - AccessWindowStatic input0_access(input0->info(), 0, 0, ceil_to_multiple(input0->info()->dimension(0), num_elems_processed_per_iteration_x), ceil_to_multiple(input0->info()->dimension(1), - num_elems_processed_per_iteration_y)); -#endif /* MM_PROCESS_4X_OPTIMIZED */ - AccessWindowStatic input1_access(input1->info(), 0, 0, ceil_to_multiple(input1->info()->dimension(0), num_elems_processed_per_iteration_x), input1->info()->dimension(1)); - AccessWindowRectangle output_access(output->info(), 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); - - update_window_and_padding(win, input0_access, input1_access, output_access); - - Coordinates coord; - coord.set_num_dimensions(output->info()->num_dimensions()); - output_access.set_valid_region(win, ValidRegion(coord, output->info()->tensor_shape())); + kernel_name = "gemm_mm_floating_point"; } - IGCKernel::configure(win); + // Create kernel + _kernel = GCKernelLibrary::get().create_kernel(kernel_name, build_opts); +} + +Status GCGEMMMatrixMultiplyKernel::validate(const ITensorInfo *input0, const ITensorInfo *input1, const ITensorInfo *output, float alpha, bool is_interleaved_transposed, + const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target) +{ + ARM_COMPUTE_UNUSED(alpha); + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input0, input1, output, is_interleaved_transposed, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input0->clone().get(), + input1->clone().get(), + output->clone().get(), + is_interleaved_transposed, + reshape_info, + gpu_target, + num_elements_processed) + .first); + return Status{}; } void GCGEMMMatrixMultiplyKernel::run(const Window &window) diff --git a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp index 4c08873dcf..55bf9b754b 100644 --- a/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp +++ b/src/core/GLES_COMPUTE/kernels/GCWeightsReshapeKernel.cpp @@ -31,11 +31,13 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/GLES_COMPUTE/GCHelpers.h" using namespace arm_compute; using namespace arm_compute::gles_compute; +using namespace arm_compute::misc::shape_calculator; GCWeightsReshapeKernel::GCWeightsReshapeKernel() : _input(nullptr), _biases(nullptr), _output(nullptr) @@ -47,15 +49,8 @@ void GCWeightsReshapeKernel::configure(const IGCTensor *input, const IGCTensor * ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_NULLPTR(output); - // Calculate output shape - TensorShape output_shape{ input->info()->tensor_shape() }; - output_shape.collapse(3); - const size_t tmp_dim = output_shape[0]; - output_shape.set(0, output_shape[1]); - output_shape.set(1, tmp_dim + (biases != nullptr ? 1 : 0)); - // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(output_shape)); + auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(compute_weights_reshaped_shape(*input->info(), (biases != nullptr)))); ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); -- cgit v1.2.1