From dbfc2dc182f90af5cad6fc283fff817ac7258a19 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Tue, 2 Apr 2019 12:51:21 +0100
Subject: COMPMID-2069: Rework CL ML layers to run exclusively on CL.

Change-Id: If6cbf7a2e013d264e5d7f7cb54143ce32ba2687b
Signed-off-by: Georgios Pinitas
Reviewed-on: https://review.mlplatform.org/c/934
Comments-Addressed: Arm Jenkins
Reviewed-by: Isabella Gottardi
Reviewed-by: Gian Marco Iodice
Tested-by: Arm Jenkins
---
 .../CL/functions/CLDeconvolutionLayerUpsample.h    | 13 +++++++--
 .../CL/functions/CLDirectDeconvolutionLayer.h      |  8 +++---
 arm_compute/runtime/CL/functions/CLLSTMLayer.h     |  2 ++
 .../CL/functions/CLDeconvolutionLayerUpsample.cpp  | 28 ++++++------------
 .../CL/functions/CLDirectDeconvolutionLayer.cpp    | 22 +++++++++------
 src/runtime/CL/functions/CLLSTMLayer.cpp           | 33 +++++++++-------------
 6 files changed, 51 insertions(+), 55 deletions(-)

diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index d2f8a78f87..3751178703 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/runtime/IFunction.h"
 
 #include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -34,9 +35,14 @@
 
 namespace arm_compute
 {
+// Forward declarations
 class ICLTensor;
 
-/** Basic function to run @ref CLDeconvolutionLayerUpsampleKernel */
+/** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLMemsetKernel
+ * -# @ref CLDeconvolutionLayerUpsampleKernel
+ */
 class CLDeconvolutionLayerUpsample : public IFunction
 {
 public:
@@ -79,7 +85,8 @@ public:
 
 private:
     CLDeconvolutionLayerUpsampleKernel _upsample;
+    CLMemsetKernel _memset;
    ICLTensor *_output;
 };
-}
+} // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index 936263d635..b9a435abb2 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -26,10 +26,9 @@
 
 #include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
 #include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
 #include "arm_compute/runtime/CL/functions/CLTranspose.h"
 
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
-
 #include "arm_compute/runtime/CL/CLMemoryGroup.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
 #include "arm_compute/runtime/IFunction.h"
@@ -70,7 +69,7 @@ class ICLTensor;
  * -# @ref CLConvolutionLayer
  *
  * And the following CPP kernels:
- * -# @ref CPPFlipWeightsKernel
+ * -# @ref CLReverse
  *
  */
 class CLDirectDeconvolutionLayer : public IFunction
@@ -119,11 +118,12 @@ private:
     CLMemoryGroup _memory_group;
     CLDeconvolutionLayerUpsample _scale_f;
     CLConvolutionLayer _conv_f;
-    CPPFlipWeightsKernel _flip_weights;
+    CLReverse _flip_weights;
 
     CLTensor _scaled_output;
     ICLTensor *_original_weights;
     CLTensor _weights_flipped;
+    CLTensor _flip_axis;
 
     bool _is_prepared;
 };
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index a804a4af5b..8bd47cbf8e 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -29,6 +29,7 @@
 #include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
 #include "arm_compute/core/CL/kernels/CLCopyKernel.h"
 #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
 #include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
 #include "arm_compute/core/Types.h"
@@ -188,6 +189,7 @@ private:
     CLWidthConcatenate2TensorsKernel _concat_weights_forget_gate;
     CLWidthConcatenate2TensorsKernel _concat_weights_input_gate;
     CLWidthConcatenate2TensorsKernel _concat_weights_output;
+    CLMemsetKernel _ones_memset_kernel;
     CLTensor _input_gate_out1;
     CLTensor _input_gate_out2;
     CLTensor _input_gate_out3;
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index ce8667d656..c66dff01c5 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,14 +27,11 @@
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
-#include
-#include
-#include
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
 CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
     : _upsample(),
+      _memset(),
       _output(nullptr)
 {
 }
@@ -51,22 +48,13 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
 
     _output = output;
+    _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
     _upsample.configure(input, _output, inner_border, info);
 }
 
 void CLDeconvolutionLayerUpsample::run()
 {
-    _output->map(CLScheduler::get().queue(), true);
-    if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-    {
-        const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-        std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-    }
-    else
-    {
-        memset(_output->buffer(), 0, _output->info()->total_size());
-    }
-    _output->unmap(CLScheduler::get().queue());
-
-    CLScheduler::get().enqueue(_upsample, false);
+    CLScheduler::get().enqueue(_memset, false);
+    CLScheduler::get().enqueue(_upsample, true);
 }
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index c01588a164..ee76248e35 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -28,7 +28,6 @@
 #include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
 #include "utils/TypePrinter.h"
 
 #include
@@ -46,6 +45,7 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptrinit(TensorInfo(TensorShape(2U), 1, DataType::U32));
     _weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
-    _flip_weights.configure(weights, &_weights_flipped);
+    _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
 
     auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
                                                     info.pad().first, info.pad().second, stride_x, stride_y);
@@ -151,10 +152,18 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights,
     const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
     _scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
 
-    // setup the function to convolve the upscaled output
+    // Setup the function to convolve the upscaled output
     const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
     _conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
     _scaled_output.allocator()->allocate();
+
+    // Setup flip axis data
+    _flip_axis.allocator()->allocate();
+    _flip_axis.map(true);
+    auto axis_data = reinterpret_cast(_flip_axis.buffer());
+    axis_data[0] = 0;
+    axis_data[1] = 1;
+    _flip_axis.unmap();
 }
 
 void CLDirectDeconvolutionLayer::run()
@@ -177,16 +186,13 @@ void CLDirectDeconvolutionLayer::prepare()
 
         // Run weights flipping and mark original weights tensor as unused
         _weights_flipped.allocator()->allocate();
-        _weights_flipped.map(true);
-        _original_weights->map(CLScheduler::get().queue(), true);
-        CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
-        _weights_flipped.unmap();
-        _original_weights->unmap(CLScheduler::get().queue());
+        _flip_weights.run();
         _original_weights->mark_as_unused();
 
         // Prepare convolution
         _conv_f.prepare();
 
+        // Free flipped weights
         if(!_weights_flipped.is_used())
         {
             _weights_flipped.allocator()->free();
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a004762a4e..13c4871148 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -44,9 +44,10 @@ CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager)
      _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(),
      _transpose_output(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(),
      _accum_output_state(), _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(),
      _concat_weights_output(),
-    _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
-    _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(),
-    _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false)
+    _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+    _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
+    _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
+    _is_prepared(false)
 {
 }
@@ -104,7 +105,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
     std::vector inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
 
     _memory_group.manage(&_forget_gate_out2);
@@ -114,7 +115,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
 
     weights_vector.emplace_back(input_to_forget_weights);
     weights_vector.emplace_back(recurrent_to_forget_weights);
-    const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(weights_vector);
+    const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
     _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
 
     _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
@@ -155,6 +156,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
     {
         _memory_group.manage(&_input_gate_out1);
         _ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+        _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
         _subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
         _ones.allocator()->allocate();
         _run_cifg_opt = true;
@@ -167,7 +169,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
         std::vector lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(lstm_weights);
+        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
         _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
 
         _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
@@ -237,7 +239,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
     std::vector in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(in_out_weights);
+    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
     _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
 
     _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
@@ -392,7 +394,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     std::vector inputs_vector;
     inputs_vector.emplace_back(input);
     inputs_vector.emplace_back(output_state_in);
-    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+    const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
     TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
 
@@ -417,7 +419,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
         std::vector lstm_weights;
         lstm_weights.emplace_back(lstm_params.input_to_input_weights());
         lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
-        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(lstm_weights);
+        TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
         TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
         ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
 
@@ -454,7 +456,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
     std::vector in_out_weights;
     in_out_weights.emplace_back(input_to_output_weights);
     in_out_weights.emplace_back(recurrent_to_output_weights);
-    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(in_out_weights);
+    TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
     TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
     ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
     // Validate output gate tmp
@@ -518,16 +520,7 @@ void CLLSTMLayer::run()
 
     if(_run_cifg_opt)
     {
-        _ones.map(true);
-        if(_ones.info()->data_type() == DataType::F16)
-        {
-            std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
-        }
-        else
-        {
-            std::fill_n(reinterpret_cast(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
-        }
-        _ones.unmap();
+        CLScheduler::get().enqueue(_ones_memset_kernel);
         CLScheduler::get().enqueue(_subtract_input_gate);
     }
     else
-- 
cgit v1.2.1
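
For context, after this change a CL deconvolution can be configured and run end to end through OpenCL-managed kernels only: the output zero-fill is a CLMemsetKernel enqueue, the weight flip goes through CLReverse, and no CPPScheduler or host-side map/memset remains in the execution path. The sketch below is illustrative and is not part of the patch: the shapes, strides and data types are invented, and it assumes the CLDirectDeconvolutionLayer::configure(input, weights, bias, output, info) overload declared in the header diffed above plus a standard CLScheduler-based OpenCL setup.

// Hypothetical usage sketch (C++); parameter values are made up for illustration only.
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h"

using namespace arm_compute;

int main()
{
    // Create the OpenCL context and queue used by the CL scheduler.
    CLScheduler::get().default_init();

    // NCHW tensors: 8x8 input with 16 feature maps, 3x3 kernels, 4 output maps.
    // With stride 2 and no padding the deconvolution output is (8 - 1) * 2 + 3 = 17.
    CLTensor src, weights, bias, dst;
    src.allocator()->init(TensorInfo(TensorShape(8U, 8U, 16U), 1, DataType::F32));
    weights.allocator()->init(TensorInfo(TensorShape(3U, 3U, 16U, 4U), 1, DataType::F32));
    bias.allocator()->init(TensorInfo(TensorShape(4U), 1, DataType::F32));
    dst.allocator()->init(TensorInfo(TensorShape(17U, 17U, 4U), 1, DataType::F32));

    CLDirectDeconvolutionLayer deconv;
    deconv.configure(&src, &weights, &bias, &dst, PadStrideInfo(2, 2, 0, 0));

    src.allocator()->allocate();
    weights.allocator()->allocate();
    bias.allocator()->allocate();
    dst.allocator()->allocate();

    // The first run prepares the function (weights flipped on the device via CLReverse),
    // then enqueues the CLMemsetKernel-backed upsample followed by the convolution.
    deconv.run();
    CLScheduler::get().sync();
    return 0;
}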