From cafec8f19ee126b72ba2b0194bd25a5a93727980 Mon Sep 17 00:00:00 2001 From: John Kesapides Date: Tue, 19 Feb 2019 15:53:59 +0000 Subject: COMPMID-1024 Investigate concatenation for RNN/LSTM OpenCL Change-Id: I0cee0853b82a7e4c487989d4a0890d58ec086045 Signed-off-by: John Kesapides Reviewed-on: https://review.mlplatform.org/c/763 Comments-Addressed: Michele Di Giorgio Tested-by: Arm Jenkins Reviewed-by: Georgios Pinitas Comments-Addressed: Arm Jenkins --- arm_compute/core/utils/misc/ShapeCalculator.h | 5 +- arm_compute/runtime/CL/functions/CLLSTMLayer.h | 14 +- src/runtime/CL/functions/CLLSTMLayer.cpp | 177 ++++++++++++++++--------- 3 files changed, 124 insertions(+), 72 deletions(-) diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h index 7a34b43028..0d07266403 100644 --- a/arm_compute/core/utils/misc/ShapeCalculator.h +++ b/arm_compute/core/utils/misc/ShapeCalculator.h @@ -1162,12 +1162,11 @@ inline TensorShape extract_shape(T *data) return data->info()->tensor_shape(); } -inline TensorShape extract_shape(const ITensorInfo *data) +inline TensorShape extract_shape(ITensorInfo *data) { return data->tensor_shape(); } - -inline TensorShape extract_shape(ITensorInfo *data) +inline TensorShape extract_shape(const ITensorInfo *data) { return data->tensor_shape(); } diff --git a/arm_compute/runtime/CL/functions/CLLSTMLayer.h b/arm_compute/runtime/CL/functions/CLLSTMLayer.h index 87fb1190b7..a804a4af5b 100644 --- a/arm_compute/runtime/CL/functions/CLLSTMLayer.h +++ b/arm_compute/runtime/CL/functions/CLLSTMLayer.h @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -30,6 +30,7 @@ #include "arm_compute/core/CL/kernels/CLCopyKernel.h" #include "arm_compute/core/CL/kernels/CLElementwiseOperationKernel.h" #include "arm_compute/core/CL/kernels/CLPixelWiseMultiplicationKernel.h" +#include "arm_compute/core/CL/kernels/CLWidthConcatenate2TensorsKernel.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLMemoryGroup.h" #include "arm_compute/runtime/CL/CLTensor.h" @@ -138,6 +139,7 @@ public: // Inherited methods overridden: void run() override; + void prepare() override; private: CLMemoryGroup _memory_group; @@ -182,16 +184,20 @@ private: CLCopyKernel _copy_cell_state; CLCopyKernel _copy_output; CLWidthConcatenateLayer _concat_scratch_buffer; + CLWidthConcatenate2TensorsKernel _concat_inputs_forget_gate; + CLWidthConcatenate2TensorsKernel _concat_weights_forget_gate; + CLWidthConcatenate2TensorsKernel _concat_weights_input_gate; + CLWidthConcatenate2TensorsKernel _concat_weights_output; CLTensor _input_gate_out1; CLTensor _input_gate_out2; CLTensor _input_gate_out3; CLTensor _input_gate_out4; - CLTensor _input_gate_out5; CLTensor _forget_gate_out1; CLTensor _forget_gate_out2; CLTensor _forget_gate_out3; CLTensor _forget_gate_out4; CLTensor _forget_gate_out5; + CLTensor _forget_gate_out6; CLTensor _cell_state_out1; CLTensor _cell_state_out2; CLTensor _cell_state_out3; @@ -201,7 +207,6 @@ private: CLTensor _output2; CLTensor _output3; CLTensor _output4; - CLTensor _output5; CLTensor _cell_state_activation; CLTensor _output_state1; CLTensor _ones; @@ -210,6 +215,7 @@ private: bool _perform_cell_clipping; bool _has_projection_weights; bool _perform_projection_clipping; + bool _is_prepared; }; -} +} // namespace arm_compute #endif /* __ARM_COMPUTE_CLLSTMLAYER_H__ */ diff --git a/src/runtime/CL/functions/CLLSTMLayer.cpp b/src/runtime/CL/functions/CLLSTMLayer.cpp index f01b1b898f..a004762a4e 100644 --- a/src/runtime/CL/functions/CLLSTMLayer.cpp +++ b/src/runtime/CL/functions/CLLSTMLayer.cpp @@ -43,10 +43,10 @@ CLLSTMLayer::CLLSTMLayer(std::shared_ptr memory_manager) _pixelwise_mul_forget_gate(), _activation_forget_gate(), _fully_connected_cell_state(), _gemm_cell_state1(), _gemm_cell_state2(), _transpose_cell_state(), _accum_cell_state1(), _accum_cell_state2(), _pixelwise_mul_cell_state1(), _activation_cell_state(), _cell_clip(), _pixelwise_mul_cell_state2(), _fully_connected_output(), _gemm_output(), _pixelwise_mul_output_state1(), _transpose_output(), _accum_output1(), _accum_output2(), _activation_output(), _activation_output_state(), _pixelwise_mul_output_state2(), _fully_connected_output_state(), _gemm_output_state(), _accum_output_state(), - _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _input_gate_out5(), - _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), - _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _output5(), _cell_state_activation(), _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), - _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false) + _projection_clip(), _copy_cell_state(), _copy_output(), _concat_scratch_buffer(), _concat_inputs_forget_gate(), _concat_weights_forget_gate(), _concat_weights_input_gate(), _concat_weights_output(), + _input_gate_out1(), _input_gate_out2(), _input_gate_out3(), _input_gate_out4(), _forget_gate_out1(), _forget_gate_out2(), _forget_gate_out3(), _forget_gate_out4(), _forget_gate_out5(), + _forget_gate_out6(), _cell_state_out1(), _cell_state_out2(), _cell_state_out3(), _cell_state_out4(), _cell_state_out5(), _output1(), _output2(), _output3(), _output4(), _cell_state_activation(), + _output_state1(), _ones(), _run_peephole_opt(false), _run_cifg_opt(false), _perform_cell_clipping(false), _has_projection_weights(false), _perform_projection_clipping(false), _is_prepared(false) { } @@ -93,25 +93,38 @@ void CLLSTMLayer::configure(const ICLTensor *input, lstm_params_info, activation_info, cell_threshold, projection_threshold)); const TensorShape cell_state_shape = cell_state_in->info()->tensor_shape(); - // Configure block that calculates the forget gate // forget_gate = Activation(input * input_to_forget_weights + output_state_in * recurrent_to_forget_weights + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias) - TensorShape forget_gate1_shape = compute_transposed_shape(*recurrent_to_output_weights->info()); + // We optimize this as follows: + // forget_gate = Activation( (input,output_state_in) * (input_to_forget_weights,recurrent_to_forget_weights) + PixelWiseMul(cell_state, cell_to_forget_weights) + forget_gate_bias _forget_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _forget_gate_out2.allocator()->init(TensorInfo(forget_gate1_shape, 1, input->info()->data_type())); _forget_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _forget_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _memory_group.manage(&_forget_gate_out1); - _fully_connected_forget_gate.configure(input, input_to_forget_weights, forget_gate_bias, &_forget_gate_out1); + std::vector inputs_vector; + inputs_vector.emplace_back(input); + inputs_vector.emplace_back(output_state_in); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector); + _forget_gate_out2.allocator()->init(TensorInfo(concat_shape, 1, input->info()->data_type())); + _memory_group.manage(&_forget_gate_out2); - _transpose_forget_gate.configure(recurrent_to_forget_weights, &_forget_gate_out2); - _memory_group.manage(&_forget_gate_out3); - _gemm_forget_gate.configure(output_state_in, &_forget_gate_out2, nullptr, &_forget_gate_out3, 1.f, 0.f); - _forget_gate_out2.allocator()->allocate(); + _concat_inputs_forget_gate.configure(input, output_state_in, &_forget_gate_out2); + + std::vector weights_vector; + + weights_vector.emplace_back(input_to_forget_weights); + weights_vector.emplace_back(recurrent_to_forget_weights); + const TensorShape weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(weights_vector); + _forget_gate_out6.allocator()->init(TensorInfo(weights_concat_shape, 1, input->info()->data_type())); + + _concat_weights_forget_gate.configure(input_to_forget_weights, recurrent_to_forget_weights, &_forget_gate_out6); + _memory_group.manage(&_forget_gate_out5); - _accum_forget_gate1.configure(ArithmeticOperation::ADD, &_forget_gate_out1, &_forget_gate_out3, &_forget_gate_out5, ConvertPolicy::SATURATE); - _forget_gate_out1.allocator()->allocate(); + _fully_connected_forget_gate.configure(&_forget_gate_out2, &_forget_gate_out6, forget_gate_bias, &_forget_gate_out5); + _memory_group.manage(&_forget_gate_out1); + _memory_group.manage(&_forget_gate_out3); + _forget_gate_out6.allocator()->allocate(); + CLTensor *forget_gate_out = &_forget_gate_out5; if(lstm_params.has_peephole_opt()) { @@ -134,6 +147,8 @@ void CLLSTMLayer::configure(const ICLTensor *input, // Configure block that calculates the input gate // input_gate = Activation(input * input_to_input_weights + output_state * recurrent_to_input_weights + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG // input_gate = 1 - forget_gate, with CIFG + // We optimize this as follows: + // input_gate = Activation((input,output_state) * (input_to_input_weights,recurrent_to_input_weights) + PixelWiseMul(cell_state, cell_to_input_weights) + input_gate_bias), without CIFG _input_gate_out1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); CLTensor *input_gate_out = &_input_gate_out1; if(lstm_params.has_cifg_opt()) @@ -146,31 +161,31 @@ void CLLSTMLayer::configure(const ICLTensor *input, } else { - TensorShape input_gate_shape = compute_transposed_shape(*recurrent_to_output_weights->info()); - - _input_gate_out2.allocator()->init(TensorInfo(input_gate_shape, 1, input->info()->data_type())); _input_gate_out3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); _input_gate_out4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _input_gate_out5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); + + std::vector lstm_weights; + lstm_weights.emplace_back(lstm_params.input_to_input_weights()); + lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); + TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(lstm_weights); + _input_gate_out2.allocator()->init(TensorInfo(lstm_weights_concat_shape, 1, input->info()->data_type())); + + _concat_weights_input_gate.configure(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &_input_gate_out2); _memory_group.manage(&_input_gate_out1); - _fully_connected_input_gate.configure(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &_input_gate_out1); - _memory_group.manage(&_input_gate_out2); - _transpose_input_gate.configure(lstm_params.recurrent_to_input_weights(), &_input_gate_out2); + _memory_group.manage(&_input_gate_out3); - _gemm_input_gate.configure(output_state_in, &_input_gate_out2, nullptr, &_input_gate_out3, 1.f, 0.f); + _fully_connected_input_gate.configure(&_forget_gate_out2, &_input_gate_out2, lstm_params.input_gate_bias(), &_input_gate_out3); _input_gate_out2.allocator()->allocate(); - _memory_group.manage(&_input_gate_out4); - _accum_input_gate1.configure(ArithmeticOperation::ADD, &_input_gate_out1, &_input_gate_out3, &_input_gate_out4, ConvertPolicy::SATURATE); - _input_gate_out3.allocator()->allocate(); - input_gate_out = &_input_gate_out4; + + input_gate_out = &_input_gate_out3; if(_run_peephole_opt) { - _memory_group.manage(&_input_gate_out5); - _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out5, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_input_gate2.configure(&_input_gate_out4, &_input_gate_out5, &_input_gate_out1, ConvertPolicy::SATURATE); + _memory_group.manage(&_input_gate_out4); + _pixelwise_mul_input_gate.configure(cell_state_in, lstm_params.cell_to_input_weights(), &_input_gate_out4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_input_gate2.configure(&_input_gate_out3, &_input_gate_out4, &_input_gate_out1, ConvertPolicy::SATURATE); + _input_gate_out3.allocator()->allocate(); _input_gate_out4.allocator()->allocate(); - _input_gate_out5.allocator()->allocate(); input_gate_out = &_input_gate_out1; } else @@ -215,35 +230,39 @@ void CLLSTMLayer::configure(const ICLTensor *input, // Configure block that calculates the output // output_state_out = Activation(input * input_to_output_weights + output_state_in * recurrent_to_output_weights + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias) - TensorShape output1_shape = compute_transposed_shape(*recurrent_to_output_weights->info()); + // We optimize this as follows: + // output_state_out = Activation( (input,output_state_in) * (input_to_output_weights, recurrent_to_output_weights) + PixelWiseMul(cell_state, cell_to_output_weights) + output_gate_bias) _output1.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _output2.allocator()->init(TensorInfo(output1_shape, 1, input->info()->data_type())); - _output3.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); - _output5.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); + _output4.allocator()->init(TensorInfo(cell_state_shape, 1, input->info()->data_type())); + std::vector in_out_weights; + in_out_weights.emplace_back(input_to_output_weights); + in_out_weights.emplace_back(recurrent_to_output_weights); + TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(in_out_weights); + _output2.allocator()->init(TensorInfo(in_out_weights_concat_shape, 1, input->info()->data_type())); + + _concat_weights_output.configure(input_to_output_weights, recurrent_to_output_weights, &_output2); _memory_group.manage(&_output1); - _fully_connected_output.configure(input, input_to_output_weights, output_gate_bias, &_output1); - _memory_group.manage(&_output2); - _transpose_output.configure(recurrent_to_output_weights, &_output2); - _memory_group.manage(&_output3); - _gemm_output.configure(output_state_in, &_output2, nullptr, &_output3, 1.f, 0.f); + _memory_group.manage(&_output4); + + _fully_connected_output.configure(&_forget_gate_out2, &_output2, output_gate_bias, &_output4); + _output2.allocator()->allocate(); - _memory_group.manage(&_output5); - _accum_output1.configure(ArithmeticOperation::ADD, &_output1, &_output3, &_output5, ConvertPolicy::SATURATE); - _output3.allocator()->allocate(); - CLTensor *output_gate_out = &_output5; + _forget_gate_out2.allocator()->allocate(); + + CLTensor *output_gate_out = &_output4; if(lstm_params.has_peephole_opt()) { - _output4.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); + _output3.allocator()->init(TensorInfo(_cell_state_out1.info()->tensor_shape(), 1, input->info()->data_type())); - _memory_group.manage(&_output4); - _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output4, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); - _accum_output2.configure(&_output5, &_output4, &_output1, ConvertPolicy::SATURATE); - _output5.allocator()->allocate(); + _memory_group.manage(&_output3); + _pixelwise_mul_output_state1.configure(&_cell_state_out1, lstm_params.cell_to_output_weights(), &_output3, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN); + _accum_output2.configure(&_output4, &_output3, &_output1, ConvertPolicy::SATURATE); + _output4.allocator()->allocate(); output_gate_out = &_output1; // Allocate intermediate buffers - _output4.allocator()->allocate(); + _output3.allocator()->allocate(); } else { @@ -369,8 +388,15 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, // Validate forget gate ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_forget_weights, forget_gate_bias, &forget_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &forget_gate, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLSaturatedArithmeticOperationKernel::validate(ArithmeticOperation::ADD, &forget_gate, &forget_gate, &forget_gate, ConvertPolicy::SATURATE)); + + std::vector inputs_vector; + inputs_vector.emplace_back(input); + inputs_vector.emplace_back(output_state_in); + const TensorShape concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(inputs_vector); + TensorInfo forget_gate_concat = TensorInfo(concat_shape, 1, input->data_type()); + + ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input, output_state_in, &forget_gate_concat)); + if(lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(cell_state_in, lstm_params.cell_to_forget_weights(), &forget_gate, 1, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_EVEN)); @@ -388,9 +414,15 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.recurrent_to_input_weights()->num_dimensions() > 2); ARM_COMPUTE_RETURN_ERROR_ON(lstm_params.input_gate_bias()->num_dimensions() > 1); + std::vector lstm_weights; + lstm_weights.emplace_back(lstm_params.input_to_input_weights()); + lstm_weights.emplace_back(lstm_params.recurrent_to_input_weights()); + TensorShape lstm_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(lstm_weights); + TensorInfo lstm_gate_concat = TensorInfo(lstm_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(lstm_params.input_to_input_weights(), lstm_params.recurrent_to_input_weights(), &lstm_gate_concat)); + ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, lstm_params.input_to_input_weights(), lstm_params.input_gate_bias(), &input_gate)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &input_gate, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&input_gate, &input_gate, &input_gate, ConvertPolicy::SATURATE)); + if(lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lstm_params.cell_to_input_weights()); @@ -419,10 +451,15 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, cell_threshold))); } + std::vector in_out_weights; + in_out_weights.emplace_back(input_to_output_weights); + in_out_weights.emplace_back(recurrent_to_output_weights); + TensorShape in_out_weights_concat_shape = arm_compute::misc::shape_calculator::calculate_width_concatenate_shape(in_out_weights); + TensorInfo in_out_gate_concat = TensorInfo(in_out_weights_concat_shape, 1, input->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(CLWidthConcatenate2TensorsKernel::validate(input_to_output_weights, recurrent_to_output_weights, &in_out_gate_concat)); // Validate output gate tmp ARM_COMPUTE_RETURN_ON_ERROR(CLFullyConnectedLayer::validate(input, input_to_output_weights, output_gate_bias, &output_gate_tmp)); - ARM_COMPUTE_RETURN_ON_ERROR(CLGEMM::validate(output_state_in, &units_out_transposed_info, nullptr, &output_gate_tmp, 1.f, 0.f, GEMMInfo())); - ARM_COMPUTE_RETURN_ON_ERROR(CLArithmeticAddition::validate(&output_gate_tmp, &output_gate_tmp, &output_gate_tmp, ConvertPolicy::SATURATE)); + if(lstm_params.has_peephole_opt()) { ARM_COMPUTE_RETURN_ON_ERROR(CLPixelWiseMultiplicationKernel::validate(&cell_state_tmp, lstm_params.cell_to_output_weights(), &output_gate_tmp, 1, ConvertPolicy::SATURATE, @@ -464,12 +501,13 @@ Status CLLSTMLayer::validate(const ITensorInfo *input, void CLLSTMLayer::run() { + prepare(); + _memory_group.acquire(); + CLScheduler::get().enqueue(_concat_inputs_forget_gate); + _fully_connected_forget_gate.run(); - CLScheduler::get().enqueue(_transpose_forget_gate); - _gemm_forget_gate.run(); - CLScheduler::get().enqueue(_accum_forget_gate1); if(_run_peephole_opt) { @@ -495,9 +533,7 @@ void CLLSTMLayer::run() else { _fully_connected_input_gate.run(); - CLScheduler::get().enqueue(_transpose_input_gate); - _gemm_input_gate.run(); - CLScheduler::get().enqueue(_accum_input_gate1); + if(_run_peephole_opt) { CLScheduler::get().enqueue(_pixelwise_mul_input_gate); @@ -521,9 +557,6 @@ void CLLSTMLayer::run() } _fully_connected_output.run(); - CLScheduler::get().enqueue(_transpose_output); - _gemm_output.run(); - CLScheduler::get().enqueue(_accum_output1); if(_run_peephole_opt) { @@ -551,3 +584,17 @@ void CLLSTMLayer::run() _memory_group.release(); } + +void CLLSTMLayer::prepare() +{ + if(!_is_prepared) + { + CLScheduler::get().enqueue(_concat_weights_forget_gate); + if(!_run_cifg_opt) + { + CLScheduler::get().enqueue(_concat_weights_input_gate); + } + CLScheduler::get().enqueue(_concat_weights_output); + _is_prepared = true; + } +} -- cgit v1.2.1