From 56e8e8627e5c6912a482a68ec4d65051a5ff4ee8 Mon Sep 17 00:00:00 2001 From: Sam Laynton Date: Thu, 5 Apr 2018 13:26:08 +0100 Subject: COMPMID-1031: Use LWS hints for G51, G51BIG, G51LIT, and TNOX Change-Id: Ie07d9225faaef778bdcfdcb56ae42ec95962e48d Signed-off-by: Sam Laynton Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/126735 Reviewed-by: Michalis Spyrou Tested-by: Jenkins --- src/core/CL/kernels/CLCol2ImKernel.cpp | 2 +- src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp | 2 +- src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp | 2 +- src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 4 ++-- src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp | 2 +- src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp | 6 +++++- src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp | 2 +- src/core/CL/kernels/CLIm2ColKernel.cpp | 2 +- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 2 +- 9 files changed, 14 insertions(+), 10 deletions(-) (limited to 'src/core/CL/kernels') diff --git a/src/core/CL/kernels/CLCol2ImKernel.cpp b/src/core/CL/kernels/CLCol2ImKernel.cpp index c66d64332a..c5bd7d2151 100644 --- a/src/core/CL/kernels/CLCol2ImKernel.cpp +++ b/src/core/CL/kernels/CLCol2ImKernel.cpp @@ -112,7 +112,7 @@ void CLCol2ImKernel::configure(const ICLTensor *input, ICLTensor *output, std::p // Configure the local work size for Bifrost with a value obtained // via exhaustive autotuning over 30 representative tensor shapes. const GPUTarget gpu_target = get_target(); - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72)) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX)) { if((_convolved_dims.first == 7) || (_convolved_dims.first == 14)) { diff --git a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp index 9e5585cba4..c9fe1cfc0b 100644 --- a/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseConvolutionLayer3x3Kernel.cpp @@ -147,7 +147,7 @@ void CLDepthwiseConvolutionLayer3x3Kernel::configure(const ICLTensor *input, con // Configure the local work size for Bifrost with a value obtained // via exhaustive autotuning for the MobileNets tensor shapes. const GPUTarget gpu_target = get_target(); - const bool is_bifrost = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72); + const bool is_bifrost = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX); // Configure kernel window unsigned int num_elems_read_per_iteration_x = 0; diff --git a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp index 18d64a1a9d..a0784dcad6 100644 --- a/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLDepthwiseIm2ColKernel.cpp @@ -79,7 +79,7 @@ void CLDepthwiseIm2ColKernel::configure(const ICLTensor *input, ICLTensor *outpu // via exhaustive autotuning for the MobileNets tensor shapes. const GPUTarget gpu_target = get_target(); - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72)) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX)) { _lws_hint = cl::NDRange(1, 2, 1); } diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp index b5526c4fca..13ee9a1d14 100644 --- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp +++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp @@ -134,7 +134,7 @@ std::pair validate_and_configure_window(ITensorInfo *input, ITen unsigned int num_elems_written_per_iteration_x = 0; unsigned int num_elems_written_per_iteration_y = 0; - if(gpu_target_is_in(target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32)) + if(gpu_target_is_in(target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && (kernel_size <= 5) && (conv_stride_x == 1) && (conv_stride_y == 1) && (data_type == DataType::F32)) { // Configure kernel window @@ -309,7 +309,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL CLBuildOptions build_options; build_options.add_option_if(_biases != nullptr, std::string("-DHAS_BIAS")); - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32)) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && (kernel_size <= 5) && (_conv_stride_x == 1) && (_conv_stride_y == 1) && (data_type == DataType::F32)) { build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(_weights->info()->dimension(2)))); diff --git a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp index 3309775c36..d409fdbc87 100644 --- a/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixAccumulateBiasesKernel.cpp @@ -52,7 +52,7 @@ std::pair validate_and_configure_window(ITensorInfo *accum, ITen unsigned int &num_elems_processed_per_iteration) { // Select the vector size to use (8 for Bifrost; 16 for Midgard). - num_elems_processed_per_iteration = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) ? 8 : 16; + num_elems_processed_per_iteration = gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) ? 8 : 16; // Configure kernel window Window win = calculate_max_window(*accum, Steps(num_elems_processed_per_iteration)); diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp index dc9c59d2d0..0f0646c1ce 100644 --- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp @@ -219,6 +219,10 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen break; case GPUTarget::G71: case GPUTarget::G72: + case GPUTarget::G51: + case GPUTarget::G51BIG: + case GPUTarget::G51LIT: + case GPUTarget::TNOX: if(input1->info()->dimension(1) == 24) { // LWS optimized for the 11x11 AlexNet convolution on Bifrost. @@ -284,7 +288,7 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(input0->info()->dimension(0))); // Create kernels according to the architecture, data type and input size. - if((gpu_target == GPUTarget::G71 || gpu_target == GPUTarget::G72) && data_type == DataType::F32) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && data_type == DataType::F32) { // The first kernel is optimized for the case of 1000 or less output elements (e.g. FC8 of AlexNet and VGG-16, and // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 output elements (e.g. diff --git a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp index 87e624cc74..a1e47f28d4 100644 --- a/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp +++ b/src/core/CL/kernels/CLGEMMMatrixVectorMultiplyKernel.cpp @@ -78,7 +78,7 @@ void CLGEMMMatrixVectorMultiplyKernel::configure(const ICLTensor *input0, const // Configure the local work size for Bifrost with a value obtained // via exhaustive autotuning for the MobileNets tensor shapes. const GPUTarget gpu_target = get_target(); - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72)) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX)) { _lws_hint = cl::NDRange(1, 1, 1); } diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp index cc19d3c263..d04c1dc00d 100644 --- a/src/core/CL/kernels/CLIm2ColKernel.cpp +++ b/src/core/CL/kernels/CLIm2ColKernel.cpp @@ -190,7 +190,7 @@ void CLIm2ColKernel::configure(const ICLTensor *input, ICLTensor *output, const vector_size = kernel_dims.width; } // Local work size and vector size optimized for the 11x11 AlexNet convolution on Bifrost. - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72) && kernel_dims.width == 11) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX) && kernel_dims.width == 11) { _lws_hint = cl::NDRange(1, 1, 1); vector_size = 8; diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp index d7b86e78f6..b4deec1386 100644 --- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp +++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp @@ -240,7 +240,7 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output, // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with). - if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72)) + if(gpu_target_is_in(gpu_target, GPUTarget::G71, GPUTarget::G72, GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, GPUTarget::TNOX)) { cl::NDRange gws = ICLKernel::gws_from_window(std::get<1>(win_config)); _lws_hint = cl::NDRange(gws[0], gws[1], 1); -- cgit v1.2.1