author    Georgios Pinitas <georgios.pinitas@arm.com>    2019-04-02 12:51:21 +0100
committer Giuseppe Rossini <giuseppe.rossini@arm.com>    2019-04-02 16:23:17 +0000
commit    dbfc2dc182f90af5cad6fc283fff817ac7258a19 (patch)
tree      5bf598dc0ddd76f60ce95da369e69300f3300670
parent    881c6842eadf2d2fd4578b9f62ee6238a83cad65 (diff)
download  ComputeLibrary-dbfc2dc182f90af5cad6fc283fff817ac7258a19.tar.gz
COMPMID-2069: Rework CL ML layers to run exclusively on CL.
Change-Id: If6cbf7a2e013d264e5d7f7cb54143ce32ba2687b
Signed-off-by: Georgios Pinitas <georgios.pinitas@arm.com>
Reviewed-on: https://review.mlplatform.org/c/934
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Isabella Gottardi <isabella.gottardi@arm.com>
Reviewed-by: Gian Marco Iodice <gianmarco.iodice@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h  | 13
-rw-r--r--  arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h    |  8
-rw-r--r--  arm_compute/runtime/CL/functions/CLLSTMLayer.h                    |  2
-rw-r--r--  src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp        | 28
-rw-r--r--  src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp          | 22
-rw-r--r--  src/runtime/CL/functions/CLLSTMLayer.cpp                          | 33
6 files changed, 51 insertions, 55 deletions
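
The recurring pattern in this change, visible across the hunks below, is replacing host-side buffer fills (map(), memset()/std::fill_n(), unmap()) with a CLMemsetKernel that is configured once and enqueued on the CL command queue. A minimal sketch of the new pattern follows; it is illustrative only, with the tensor setup and the configure()/run() split of the real functions collapsed into one helper for brevity:

#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/CL/CLScheduler.h"

// Sketch only: in the library the configure() call happens once at graph-build time
// and the enqueue happens in run(); both are shown together here. The helper name is
// illustrative, not part of the library.
void zero_fill_on_device(arm_compute::ICLTensor *output, arm_compute::CLMemsetKernel &memset_kernel)
{
    using namespace arm_compute;
    // The fill value carries the tensor's data type and quantization info,
    // as in CLDeconvolutionLayerUpsample::configure() below.
    memset_kernel.configure(output, PixelValue(0, output->info()->data_type(), output->info()->quantization_info()));
    CLScheduler::get().enqueue(memset_kernel, false);
}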
diff --git a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
index d2f8a78f87..3751178703 100644
--- a/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
+++ b/arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,6 +27,7 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/core/CL/kernels/CLDeconvolutionLayerUpsampleKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/IFunction.h"
@@ -34,9 +35,14 @@
namespace arm_compute
{
+// Forward declarations
class ICLTensor;
-/** Basic function to run @ref CLDeconvolutionLayerUpsampleKernel */
+/** Basic function to execute deconvolution upsample on OpenCL. This function calls the following OpenCL kernels and functions:
+ *
+ * -# @ref CLMemsetKernel
+ * -# @ref CLDeconvolutionLayerUpsampleKernel
+ */
class CLDeconvolutionLayerUpsample : public IFunction
{
public:
@@ -79,7 +85,8 @@ public:
private:
CLDeconvolutionLayerUpsampleKernel _upsample;
+ CLMemsetKernel _memset;
ICLTensor *_output;
};
-}
+} // namespace arm_compute
#endif /* __ARM_COMPUTE_CLDECONVOLUTIONLAYERUPSAMPLE_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
index 936263d635..b9a435abb2 100644
--- a/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
+++ b/arm_compute/runtime/CL/functions/CLDirectDeconvolutionLayer.h
@@ -26,10 +26,9 @@
#include "arm_compute/runtime/CL/functions/CLConvolutionLayer.h"
#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"
+#include "arm_compute/runtime/CL/functions/CLReverse.h"
#include "arm_compute/runtime/CL/functions/CLTranspose.h"
-#include "arm_compute/core/CPP/kernels/CPPFlipWeightsKernel.h"
-
#include "arm_compute/runtime/CL/CLMemoryGroup.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/IFunction.h"
@@ -70,7 +69,7 @@ class ICLTensor;
* -# @ref CLConvolutionLayer
*
* And the following CPP kernels:
- * -# @ref CPPFlipWeightsKernel
+ * -# @ref CLReverse
*
*/
class CLDirectDeconvolutionLayer : public IFunction
@@ -119,11 +118,12 @@ private:
CLMemoryGroup _memory_group;
CLDeconvolutionLayerUpsample _scale_f;
CLConvolutionLayer _conv_f;
- CPPFlipWeightsKernel _flip_weights;
+ CLReverse _flip_weights;
CLTensor _scaled_output;
ICLTensor *_original_weights;
CLTensor _weights_flipped;
+ CLTensor _flip_axis;
bool _is_prepared;
};
diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
index a804a4af5b..8bd47cbf8e 100644
--- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h
+++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h
@@ -29,6 +29,7 @@
#include "arm_compute/core/CL/kernels/CLActivationLayerKernel.h"
#include "arm_compute/core/CL/kernels/CLCopyKernel.h"
#include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h"
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
#include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h"
#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h"
#include "arm_compute/core/Types.h"
@@ -188,6 +189,7 @@ private:
CLWidthConcatenate2TensorsKernel _concat_weights_forget_gate;
CLWidthConcatenate2TensorsKernel _concat_weights_input_gate;
CLWidthConcatenate2TensorsKernel _concat_weights_output;
+ CLMemsetKernel _ones_memset_kernel;
CLTensor _input_gate_out1;
CLTensor _input_gate_out2;
CLTensor _input_gate_out3;
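
The CLLSTMLayer.cpp hunks further down also switch from the width-specific shape helper to the generalized one: calculate_width_concatenate_shape(v) becomes calculate_concatenate_shape(v, 0), where 0 is the concatenation axis (width). A hedged sketch of that call, wrapped in an illustrative helper:

#include <vector>
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"

// Hedged sketch: compute the output shape of a width-wise (axis 0) concatenation of two
// tensor infos, using the generalized helper this commit switches to. The function name
// is illustrative only.
arm_compute::TensorShape width_concat_shape(const arm_compute::ITensorInfo *a, const arm_compute::ITensorInfo *b)
{
    std::vector<const arm_compute::ITensorInfo *> infos{ a, b };
    return arm_compute::misc::shape_calculator::calculate_concatenate_shape(infos, 0);
}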
diff --git a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
index ce8667d656..c66dff01c5 100644
--- a/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
+++ b/src/runtime/CL/functions/CLDeconvolutionLayerUpsample.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2018 ARM Limited.
+ * Copyright (c) 2017-2019 ARM Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -27,14 +27,11 @@
#include "arm_compute/core/Utils.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include <cmath>
-#include <memory>
-#include <tuple>
-
-using namespace arm_compute;
-
+namespace arm_compute
+{
CLDeconvolutionLayerUpsample::CLDeconvolutionLayerUpsample() // NOLINT
: _upsample(),
+ _memset(),
_output(nullptr)
{
}
@@ -51,22 +48,13 @@ void CLDeconvolutionLayerUpsample::configure(ICLTensor *input, ICLTensor *output
ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
_output = output;
+ _memset.configure(_output, PixelValue(0, _output->info()->data_type(), _output->info()->quantization_info()));
_upsample.configure(input, _output, inner_border, info);
}
void CLDeconvolutionLayerUpsample::run()
{
- _output->map(CLScheduler::get().queue(), true);
- if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
- {
- const uint8_t quantized_zero = _output->info()->quantization_info().offset;
- std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
- }
- else
- {
- memset(_output->buffer(), 0, _output->info()->total_size());
- }
- _output->unmap(CLScheduler::get().queue());
-
- CLScheduler::get().enqueue(_upsample, false);
+ CLScheduler::get().enqueue(_memset, false);
+ CLScheduler::get().enqueue(_upsample, true);
}
+} // namespace arm_compute
\ No newline at end of file
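
With the host-side memset gone, run() is now a pure device-side sequence: the memset kernel is enqueued first, then the upsample kernel. A hedged usage sketch of the function after this change (the helper name is illustrative; tensor allocation and shape setup are the caller's responsibility and are omitted):

#include "arm_compute/runtime/CL/functions/CLDeconvolutionLayerUpsample.h"

// Hedged sketch: 'input' and 'output' are assumed to be already allocated CL tensors with
// compatible shapes; 'upsample_info' carries the deconvolution stride and padding.
void run_upsample(arm_compute::ICLTensor *input, arm_compute::ICLTensor *output,
                  const arm_compute::PadStrideInfo &upsample_info)
{
    using namespace arm_compute;
    CLDeconvolutionLayerUpsample upsample;
    upsample.configure(input, output, BorderSize(), upsample_info); // also configures the zero-fill memset
    upsample.run();                                                 // enqueues _memset, then _upsample
}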
diff --git a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
index c01588a164..ee76248e35 100644
--- a/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLDirectDeconvolutionLayer.cpp
@@ -28,7 +28,6 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
-#include "arm_compute/runtime/CPP/CPPScheduler.h"
#include "utils/TypePrinter.h"
#include <memory>
@@ -46,6 +45,7 @@ CLDirectDeconvolutionLayer::CLDirectDeconvolutionLayer(std::shared_ptr<IMemoryMa
_scaled_output(),
_original_weights(nullptr),
_weights_flipped(),
+ _flip_axis(),
_is_prepared(false)
{
}
@@ -120,8 +120,9 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights,
const size_t idx_h = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
_original_weights = weights;
+ _flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
_weights_flipped.allocator()->init(weights->info()->clone()->set_data_layout(data_layout));
- _flip_weights.configure(weights, &_weights_flipped);
+ _flip_weights.configure(weights, &_weights_flipped, &_flip_axis);
auto out_dims = deconvolution_output_dimensions(input->info()->dimension(idx_w), input->info()->dimension(idx_h), weights->info()->dimension(idx_w), weights->info()->dimension(idx_h),
info.pad().first, info.pad().second, stride_x, stride_y);
@@ -151,10 +152,18 @@ void CLDirectDeconvolutionLayer::configure(ICLTensor *input, ICLTensor *weights,
const PadStrideInfo upsample_info(stride_x, stride_y, padx / 2, pady / 2);
_scale_f.configure(input, &_scaled_output, BorderSize(), upsample_info);
- // setup the function to convolve the upscaled output
+ // Setup the function to convolve the upscaled output
const PadStrideInfo conv_info(1, 1, 0, 0, 0, 0, DimensionRoundingType::CEIL);
_conv_f.configure(&_scaled_output, &_weights_flipped, bias, output, conv_info, weights_info);
_scaled_output.allocator()->allocate();
+
+ // Setup flip axis data
+ _flip_axis.allocator()->allocate();
+ _flip_axis.map(true);
+ auto axis_data = reinterpret_cast<uint32_t *>(_flip_axis.buffer());
+ axis_data[0] = 0;
+ axis_data[1] = 1;
+ _flip_axis.unmap();
}
void CLDirectDeconvolutionLayer::run()
@@ -177,16 +186,13 @@ void CLDirectDeconvolutionLayer::prepare()
// Run weights flipping and mark original weights tensor as unused
_weights_flipped.allocator()->allocate();
- _weights_flipped.map(true);
- _original_weights->map(CLScheduler::get().queue(), true);
- CPPScheduler::get().schedule(&_flip_weights, Window::DimZ);
- _weights_flipped.unmap();
- _original_weights->unmap(CLScheduler::get().queue());
+ _flip_weights.run();
_original_weights->mark_as_unused();
// Prepare convolution
_conv_f.prepare();
+ // Free flipped weights
if(!_weights_flipped.is_used())
{
_weights_flipped.allocator()->free();
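
Weight flipping follows the same move: the host-scheduled CPPFlipWeightsKernel is replaced by CLReverse, driven by a small two-element U32 tensor that selects the width and height axes. A hedged sketch of that setup, mirroring the configure() hunk above (the helper name is illustrative, and initialization of the flipped-weights tensor is omitted):

#include <cstdint>
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLReverse.h"

// Hedged sketch: 'weights' and 'weights_flipped' are assumed to be valid, initialized CL tensors.
void configure_weight_flip(arm_compute::ICLTensor *weights, arm_compute::CLTensor &weights_flipped,
                           arm_compute::CLTensor &flip_axis, arm_compute::CLReverse &flip_weights)
{
    using namespace arm_compute;
    flip_axis.allocator()->init(TensorInfo(TensorShape(2U), 1, DataType::U32));
    flip_weights.configure(weights, &weights_flipped, &flip_axis);

    // The axis indices are written once on the host at configure time;
    // prepare() later just calls flip_weights.run() on the device.
    flip_axis.allocator()->allocate();
    flip_axis.map(true);
    auto axis_data = reinterpret_cast<uint32_t *>(flip_axis.buffer());
    axis_data[0] = 0; // reverse along width
    axis_data[1] = 1; // reverse along height
    flip_axis.unmap();
}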
diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp
index a004762a4e..13c4871148 100644
--- a/src/runtime/CL/functions/CLLSTMLayer.cpp
+++ b/src/runtime/CL/functions/CLLSTMLayer.cpp
@@ -44,9 +44,10 @@ CLLSTMLayer::CLLSTMLayer(std::shared_ptr<IMemoryManager> memory_manager)
_pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(),
_accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(),
_projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(),
- _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(),
- _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(),
- _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false)
+ _ones_memset_kernel(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(),
+ _forget_gate_out5(), _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(),
+ _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false),
+ _is_prepared(false)
{
}
@@ -104,7 +105,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
std::vector<const ICLTensor *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
_forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type()));
_memory_group.manage(&_forget_gate_out2);
@@ -114,7 +115,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
weights_vector.emplace_back(input_to_forget_weights);
weights_vector.emplace_back(recurrent_to_forget_weights);
- const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(weights_vector);
+ const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(weights_vector, 0);
_forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6);
@@ -155,6 +156,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
{
_memory_group.manage(&_input_gate_out1);
_ones.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type()));
+ _ones_memset_kernel.configure(&_ones, PixelValue(1, _ones.info()->data_type()));
_subtract_input_gate.configure(ArithmeticOperation::SUB, &_ones, forget_gate_out, &_input_gate_out1, ConvertPolicy::SATURATE);
_ones.allocator()->allocate();
_run_cifg_opt = true;
@@ -167,7 +169,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
std::vector<const ICLTensor *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(lstm_weights);
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
_input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2);
@@ -237,7 +239,7 @@ void CLLSTMLayer::configure(const ICLTensor *input,
std::vector<const ICLTensor *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(in_out_weights);
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
_output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type()));
_concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2);
@@ -392,7 +394,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> inputs_vector;
inputs_vector.emplace_back(input);
inputs_vector.emplace_back(output_state_in);
- const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector);
+ const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(inputs_vector, 0);
TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat));
@@ -417,7 +419,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> lstm_weights;
lstm_weights.emplace_back(lstm_params.input_to_input_weights());
lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights());
- TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(lstm_weights);
+ TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(lstm_weights, 0);
TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat));
@@ -454,7 +456,7 @@ Status CLLSTMLayer::validate(const ITensorInfo *input,
std::vector<const ITensorInfo *> in_out_weights;
in_out_weights.emplace_back(input_to_output_weights);
in_out_weights.emplace_back(recurrent_to_output_weights);
- TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(in_out_weights);
+ TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(in_out_weights, 0);
TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type());
ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat));
// Validate output gate tmp
@@ -518,16 +520,7 @@ void CLLSTMLayer::run()
if(_run_cifg_opt)
{
- _ones.map(true);
- if(_ones.info()->data_type() == DataType::F16)
- {
- std::fill_n(reinterpret_cast<half *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
- }
- else
- {
- std::fill_n(reinterpret_cast<float *>(_ones.buffer()), _ones.info()->total_size() / _ones.info()->element_size(), 1);
- }
- _ones.unmap();
+ CLScheduler::get().enqueue(_ones_memset_kernel);
CLScheduler::get().enqueue(_subtract_input_gate);
}
else