diff options
Diffstat (limited to 'src/core/CL/kernels')
4 files changed, 26 insertions, 24 deletions
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index 0a0de7ad4e..805a594af6 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -286,17 +286,23 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen else // The input tensors have not been reshaped { build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); // Create kernels according to the architecture, data type and input size. if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && is_data_type_float(data_type)) { - kernel_name = "gemm_mm_floating_point_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; - // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and - // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g. - // FC6 and FC7 of AlexNet and VGG-16). - if(input1->info()->dimension(0) <= 1000 && input0->info()->num_dimensions() == 1 && data_type == DataType::F32) + kernel_name = "gemm_mm_floating_point"; + + if(input0->info()->num_dimensions() != 1) + { + kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; + } + else if(input1->info()->dimension(0) <= 1000 && data_type == DataType::F32) { - kernel_name += "_1000"; + // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and + // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g. + // FC6 and FC7 of AlexNet and VGG-16). + kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000"; } // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels @@ -309,7 +315,6 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen } else // (MIDGARD and F32) or (F16) { - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); kernel_name = "gemm_mm_floating_point"; } build_opts.add_option("-DNUM_ELEMS_PROCESSED_PER_THREAD_Y=" + support::cpp11::to_string(num_elements_processed.y())); diff --git a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp index d3a33c01f9..41b3ac50b5 100644 --- a/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradFilterTransformKernel.cpp @@ -55,9 +55,11 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U)); - ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) && output_tile_size != Size2D(4U, 4U)); - ARM_COMPUTE_RETURN_ERROR_ON(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd filter transform only supports 3x3 and 5x5 kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) + && output_tile_size != Size2D(4U, 4U), + "Winograd filter transform only supports 2x2 or 4x4 output tile for 3x3 kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd filter transform only supports 4x4 output tile for 5x5 kernels"); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); diff --git a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp index a47590d20f..febd22b04e 100644 --- a/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradInputTransformKernel.cpp @@ -47,7 +47,9 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const Size2D kernel_size = winograd_info.kernel_size; ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Winograd input transform only supports 3x3 and 5x5 kernels"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U), "Winograd input transform only supports 2x2 output tile for 3x3 kernels"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size != Size2D(2U, 2U) + && output_tile_size != Size2D(4U, 4U), + "Winograd input transform only supports 2x2 or 4x4 output tile for 3x3 kernels"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size != Size2D(4U, 4U), "Winograd input transform only supports 4x4 output tile for 5x5 kernels"); ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); @@ -111,7 +113,6 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor const int num_elements_y = input->info()->dimension(1) - (kernel_size.height - 1) + conv_info.pad_top() + conv_info.pad_bottom(); // Check if we need to extend the right or bottom border - // FIXME: This actually is not needed. Added just for validating the result; const unsigned int extra_border_right = ((num_elements_x % output_tile_size.width) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.width - 1); const unsigned int extra_border_bottom = ((num_elements_y % output_tile_size.height) == 0) ? 0u : static_cast<unsigned int>(output_tile_size.height - 1); @@ -137,19 +138,13 @@ void CLWinogradInputTransformKernel::configure(const ICLTensor *input, ICLTensor std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); // Check optimized kernel if output_dims == 2x2 - if(output_tile_size.width == 2 && output_tile_size.height == 2) + if(output_tile_size == Size2D(2U, 2U)) { - if((_input->info()->dimension(2) % 2) != 0) - { - _step_z = 1; - } - else - { - _step_z = 2; - _lws_hint = cl::NDRange(1, 1, 8); - } + _step_z = (_input->info()->dimension(2) % 2) != 0 ? 1 : 2; } + _lws_hint = cl::NDRange(1, 1, 8); + // Append stepz and data layout kernel_name += "_stepz"; kernel_name += support::cpp11::to_string(_step_z); diff --git a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp index 8ee1a82209..c5d2528aa2 100644 --- a/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp +++ b/src/core/CL/kernels/CLWinogradOutputTransformKernel.cpp @@ -58,6 +58,7 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size != Size2D(3U, 3U) && kernel_size != Size2D(5U, 5U), "Only 3x3 and 5x5 kernels are supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(2U, 2U) && input->dimension(2) != 16, "Wrong number of batches"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(3U, 3U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 36, "Wrong number of batches"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(kernel_size == Size2D(5U, 5U) && output_tile_size == Size2D(4U, 4U) && input->dimension(2) != 64, "Wrong number of batches"); // Compute number of elements to process in the X and Y direction @@ -67,7 +68,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const int num_tiles_y = std::ceil(num_elements_y / static_cast<float>(output_tile_size.height)); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles_x * num_tiles_y))); - ARM_COMPUTE_UNUSED(output_tile_size); if(bias != nullptr) { @@ -207,4 +207,4 @@ void CLWinogradOutputTransformKernel::run(const Window &window, cl::CommandQueue enqueue(queue, *this, slice, _lws_hint); } while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); -}
\ No newline at end of file +} |