aboutsummaryrefslogtreecommitdiff
path: root/src/runtime
diff options
context:
space:
mode:
authorGiorgio Arena <giorgio.arena@arm.com>2018-05-03 15:57:48 +0100
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:52:35 +0000
commita3221e6772dc371cf5de7e525bf5c22b58ad6d08 (patch)
tree14d224e07d92dbbd97966de0b6b0aa8e6a288022 /src/runtime
parent20b4313365ea2ed31f59fd757f68f791f076e6bc (diff)
downloadComputeLibrary-a3221e6772dc371cf5de7e525bf5c22b58ad6d08.tar.gz
COMPMID-1106 Add fast math support in NEWinogradConvolutionLayer
Change-Id: I5fcbbb3b6f22204f0aaebbc319dfdf03593577e8 Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/130067 Tested-by: Jenkins <bsgcomp@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com> Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Diffstat (limited to 'src/runtime')
-rw-r--r--src/runtime/NEON/functions/NEConvolutionLayer.cpp32
-rw-r--r--src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp98
2 files changed, 80 insertions, 50 deletions
diff --git a/src/runtime/NEON/functions/NEConvolutionLayer.cpp b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
index 69fb948d3f..1c294f8f5d 100644
--- a/src/runtime/NEON/functions/NEConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEConvolutionLayer.cpp
@@ -41,18 +41,19 @@ NEConvolutionLayer::NEConvolutionLayer(std::shared_ptr<IMemoryManager> memory_ma
}
void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
- const Size2D &dilation, const ActivationLayerInfo &act_info)
+ const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
// Perform validate step
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info));
+ ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayer::validate(input->info(), weights->info(), ((biases != nullptr) ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info,
+ enable_fast_math));
switch(NEConvolutionLayer::get_convolution_method(input->info(), weights->info(), output->info(), conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
{
auto f = arm_compute::support::cpp14::make_unique<NEWinogradConvolutionLayer>(_memory_manager);
- f->configure(input, weights, biases, output, conv_info, act_info);
+ f->configure(input, weights, biases, output, conv_info, act_info, enable_fast_math);
_function = std::move(f);
break;
}
@@ -77,13 +78,13 @@ void NEConvolutionLayer::configure(ITensor *input, const ITensor *weights, const
}
Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
switch(NEConvolutionLayer::get_convolution_method(input, weights, output, conv_info, weights_info, dilation, act_info))
{
case ConvolutionMethod::WINOGRAD:
//Validate Winograd
- NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info);
+ NEWinogradConvolutionLayer::validate(input, weights, biases, output, conv_info, act_info, enable_fast_math);
break;
case ConvolutionMethod::GEMM:
//Validate Gemm-based Convolution
@@ -102,27 +103,18 @@ Status NEConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo
ConvolutionMethod NEConvolutionLayer::get_convolution_method(const ITensorInfo *input, const ITensorInfo *weights,
const ITensorInfo *output, const PadStrideInfo &conv_info,
- const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info)
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output, weights);
- ARM_COMPUTE_UNUSED(output);
ARM_COMPUTE_UNUSED(weights_info);
- const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
- const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- const size_t idx_c = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL);
-
- if((input->data_type() == DataType::F32) && (input->data_layout() == DataLayout::NCHW) && (input->dimension(idx_c) > 16) && (weights->dimension(idx_w) == 3) && (weights->dimension(idx_h) == 3)
- && (weights->num_dimensions() <= 4)
- && (conv_info.stride().first == 1) && (conv_info.stride().second == 1) && (dilation == Size2D(1U, 1U)) && (!act_info.enabled()))
+ if(dilation != Size2D(1U, 1U) || Scheduler::get().cpu_info().get_cpu_model() == CPUModel::A53
+ || input->dimension(get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::CHANNEL)) <= 16)
{
- //FIXME Until COMPMID-1041 is implemented Winograd is slower than GEMM on A53.
- if(Scheduler::get().cpu_info().get_cpu_model() != CPUModel::A53)
- {
- return ConvolutionMethod::WINOGRAD;
- }
+ return ConvolutionMethod::GEMM;
}
- return ConvolutionMethod::GEMM;
+
+ return bool(NEWinogradConvolutionLayer::validate(input, weights, nullptr, output, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM;
}
void NEConvolutionLayer::run()
diff --git a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
index d745f42f1a..8f2c4c4361 100644
--- a/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEWinogradConvolutionLayer.cpp
@@ -74,6 +74,39 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights,
return Status{};
}
+
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims)
+{
+ Size2D output_tile = Size2D{};
+
+ if(kernel_dims == Size2D(3U, 3U))
+ {
+ output_tile = (input_dims.width <= 4 && input_dims.height <= 4) ? Size2D(2U, 2U) : Size2D(4U, 4U);
+ }
+ else if(kernel_dims == Size2D(5U, 5U))
+ {
+ output_tile = Size2D(2U, 2U);
+ }
+
+ return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+ // Check if we want to configure a Winograd configuration which requires fast math
+ using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+ std::vector<WinogradConfiguration> fast_math_winograd =
+ {
+ WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(5, 5)),
+ WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5))
+ };
+
+ auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+ std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+ return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
} //namespace
NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryManager> memory_manager)
@@ -83,33 +116,40 @@ NEWinogradConvolutionLayer::NEWinogradConvolutionLayer(std::shared_ptr<IMemoryMa
{
} /* arm_compute */
-void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info)
+void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info,
+ bool enable_fast_math)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), weights->info(), (biases != nullptr) ? biases->info() : nullptr, output->info(), conv_info));
- _weights = weights;
- _input = input;
- _output = output;
-
// Get indices for the width and height
const DataLayout data_layout = input->info()->data_layout();
const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const Size2D input_dims = Size2D(input->info()->dimension(width_idx), input->info()->dimension(height_idx));
+ const Size2D kernel_size = Size2D(weights->info()->dimension(width_idx), weights->info()->dimension(height_idx));
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
+
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
+
+ _weights = weights;
+ _input = input;
+ _output = output;
+
std::unique_ptr<INEWinogradLayerTransformInputKernel<float>> transform_input_kernel;
std::unique_ptr<INEWinogradLayerTransformWeightsKernel<float>> transform_weights_kernel;
std::unique_ptr<INEWinogradLayerTransformOutputKernel<float>> transform_output_kernel;
- const int weights_width = weights->info()->dimension(width_idx);
- const int weights_height = weights->info()->dimension(height_idx);
+ int n_gemms = 0;
+ int N_BLOCK = 0; // Size of block used by GEMM.
- Size2D output_tile{};
- int n_gemms = 0;
- int N_BLOCK = 0; // Size of block used by GEMM.
-
- switch(weights_width)
+ switch(kernel_size.width)
{
case 3:
{
@@ -118,7 +158,6 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>>();
transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>>();
transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>>();
- output_tile = Size2D(4U, 4U);
n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradBase::N_GEMMS;
N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 4, 4, 3, 3>::WinogradConv::N_BLOCK;
}
@@ -127,7 +166,6 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 3, 3>>();
transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 3, 3>>();
transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 3, 3>>();
- output_tile = Size2D(2U, 2U);
n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradBase::N_GEMMS;
N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 3, 3>::WinogradConv::N_BLOCK;
}
@@ -138,7 +176,6 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
transform_input_kernel = support::cpp14::make_unique<NEWinogradLayerTransformInputKernel<float, 2, 2, 5, 5>>();
transform_weights_kernel = support::cpp14::make_unique<NEWinogradLayerTransformWeightsKernel<float, 2, 2, 5, 5>>();
transform_output_kernel = support::cpp14::make_unique<NEWinogradLayerTransformOutputKernel<float, 2, 2, 5, 5>>();
- output_tile = Size2D(2U, 2U);
n_gemms = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradBase::N_GEMMS;
N_BLOCK = NEWinogradLayerBatchedGEMMKernel<float, float, 2, 2, 5, 5>::WinogradConv::N_BLOCK;
break;
@@ -189,7 +226,7 @@ void NEWinogradConvolutionLayer::configure(const ITensor *input, const ITensor *
_permute_input.configure(input, &_input_nhwc, PermutationVector(2U, 0U, 1U));
_input_nhwc.allocator()->allocate();
- const KernelShape kernel_shape({ out_channels, weights_height, weights_width, in_channels });
+ const KernelShape kernel_shape({ out_channels, static_cast<int>(kernel_size.height), static_cast<int>(kernel_size.width), in_channels });
// Configure the InputTransform
const int input_matrix_stride = transform_input_kernel->get_matrix_stride(kernel_shape, in_shape, use_padding_type);
@@ -292,7 +329,7 @@ void NEWinogradConvolutionLayer::run()
}
Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
- const ActivationLayerInfo &act_info)
+ const ActivationLayerInfo &act_info, bool enable_fast_math)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, weights, biases, output, conv_info));
@@ -300,20 +337,21 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
// Get indices for the width and height
const size_t idx_width = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH);
const size_t idx_height = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT);
- // Input shape
- const TensorShape input_shape = input->tensor_shape();
- const unsigned int input_w = input_shape[idx_width];
- const unsigned int input_h = input_shape[idx_height];
- // Kernel size
- const unsigned int kernel_w = weights->tensor_shape()[idx_width];
- const unsigned int kernel_h = weights->tensor_shape()[idx_height];
+ // Input shape, kernel size and output tile
+ const Size2D input_dims = Size2D(input->dimension(idx_width), input->dimension(idx_height));
+ const Size2D kernel_size = Size2D(weights->dimension(idx_width), weights->dimension(idx_height));
+ const Size2D output_tile = winograd_output_tile(input_dims, kernel_size);
- const Size2D output_tile = (Size2D(kernel_w, kernel_h) == Size2D(3U, 3U) && input_w > 4 && input_h > 4) ? Size2D(4U, 4U) : Size2D(2U, 2U);
+ // Check if the Winograd configuration requires fast math
+ if(!enable_fast_math)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true");
+ }
const WinogradInfo winograd_info = WinogradInfo(output_tile,
- Size2D(kernel_w, kernel_h),
- Size2D(input_shape[idx_width], input_shape[idx_height]),
+ kernel_size,
+ input_dims,
conv_info,
input->data_layout());
@@ -324,7 +362,7 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
{
case 3:
{
- if(input_w > 4 && input_h > 4)
+ if(input_dims.width > 4 && input_dims.height > 4)
{
ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformInputKernel<float, 4, 4, 3, 3>::validate(input, &input0, winograd_info)));
}
@@ -353,7 +391,7 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
{
case 3:
{
- if(input_w > 4 && input_h > 4)
+ if(input_dims.width > 4 && input_dims.height > 4)
{
ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformWeightsKernel<float, 4, 4, 3, 3>::validate(weights, &input1, winograd_info)));
}
@@ -382,7 +420,7 @@ Status NEWinogradConvolutionLayer::validate(const ITensorInfo *input, const ITen
{
case 3:
{
- if(input_w > 4 && input_h > 4)
+ if(input_dims.width > 4 && input_dims.height > 4)
{
// Validate output transform
ARM_COMPUTE_RETURN_ON_ERROR((NEWinogradLayerTransformOutputKernel<float, 4, 4, 3, 3>::validate(&batched_mm_output, biases, output, winograd_info)));