ArmNN 21.11: RefQLstmWorkload.cpp
//
// Copyright © 2020 Arm Ltd. All rights reserved.
// SPDX-License-Identifier: MIT
//

#include "RefQLstmWorkload.hpp"
#include "Activation.hpp"
#include "Encoders.hpp"
#include "Decoders.hpp"
#include "LstmUtils.hpp"
#include "RefWorkloadUtils.hpp"

namespace armnn
{

RefQLstmWorkload::RefQLstmWorkload(const QLstmQueueDescriptor& descriptor, const WorkloadInfo& info)
    : BaseWorkload<QLstmQueueDescriptor>(descriptor, info)
    , m_InputToInputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_InputToInputWeights))
    , m_InputToForgetWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToForgetWeights))
    , m_InputToCellWeightsTensor      (AssignScopedTensorHandle(descriptor.m_InputToCellWeights))
    , m_InputToOutputWeightsTensor    (AssignScopedTensorHandle(descriptor.m_InputToOutputWeights))

    , m_RecurrentToInputWeightsTensor (AssignScopedTensorHandle(descriptor.m_RecurrentToInputWeights))
    , m_RecurrentToForgetWeightsTensor(AssignScopedTensorHandle(descriptor.m_RecurrentToForgetWeights))
    , m_RecurrentToCellWeightsTensor  (AssignScopedTensorHandle(descriptor.m_RecurrentToCellWeights))
    , m_RecurrentToOutputWeightsTensor(AssignScopedTensorHandle(descriptor.m_RecurrentToOutputWeights))

    , m_CellToInputWeightsTensor      (AssignScopedTensorHandle(descriptor.m_CellToInputWeights))
    , m_CellToForgetWeightsTensor     (AssignScopedTensorHandle(descriptor.m_CellToForgetWeights))
    , m_CellToOutputWeightsTensor     (AssignScopedTensorHandle(descriptor.m_CellToOutputWeights))

    , m_InputGateBiasTensor           (AssignScopedTensorHandle(descriptor.m_InputGateBias))
    , m_ForgetGateBiasTensor          (AssignScopedTensorHandle(descriptor.m_ForgetGateBias))
    , m_CellBiasTensor                (AssignScopedTensorHandle(descriptor.m_CellBias))
    , m_OutputGateBiasTensor          (AssignScopedTensorHandle(descriptor.m_OutputGateBias))

    , m_ProjectionWeightsTensor       (AssignScopedTensorHandle(descriptor.m_ProjectionWeights))
    , m_ProjectionBiasTensor          (AssignScopedTensorHandle(descriptor.m_ProjectionBias))

    , m_InputLayerNormWeightsTensor   (AssignScopedTensorHandle(descriptor.m_InputLayerNormWeights))
    , m_ForgetLayerNormWeightsTensor  (AssignScopedTensorHandle(descriptor.m_ForgetLayerNormWeights))
    , m_CellLayerNormWeightsTensor    (AssignScopedTensorHandle(descriptor.m_CellLayerNormWeights))
    , m_OutputLayerNormWeightsTensor  (AssignScopedTensorHandle(descriptor.m_OutputLayerNormWeights))
{}

void RefQLstmWorkload::Execute() const
{
    Execute(m_Data.m_Inputs, m_Data.m_Outputs);
}

void RefQLstmWorkload::ExecuteAsync(WorkingMemDescriptor& workingMemDescriptor)
{
    Execute(workingMemDescriptor.m_Inputs, workingMemDescriptor.m_Outputs);
}

void RefQLstmWorkload::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs) const
{
    // This is a port of the QLSTM::Execute(std::vector<ITensorHandle*> inputs, std::vector<ITensorHandle*> outputs)
    // method in the Android code base.
    // Note: this implementation wraps the arithmetic functions of the LSTM cell in Quantize/Dequantize ops, so all
    // computation is done in the floating point domain. Arithmetic functions are found in LstmUtils.cpp.
    // Refer to: android/frameworks/ml/nn/common/operations/QLSTM.cpp
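    // The recurrence evaluated below, in standard LSTM notation:
    //   i_t = sigmoid(W_{xi} x_t + W_{hi} h_{t-1})      (i_t = 1 - f_t when CIFG is enabled)
    //   f_t = sigmoid(W_{xf} x_t + W_{hf} h_{t-1})
    //   g_t = tanh   (W_{xc} x_t + W_{hc} h_{t-1})
    //   c_t = f_t * c_{t-1} + i_t * g_t
    //   o_t = sigmoid(W_{xo} x_t + W_{ho} h_{t-1})
    //   h_t = o_t * tanh(c_t)
    // with optional peephole terms, layer normalization (plus gate bias), and cell/projection
    // clipping applied along the way.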
    const DataType& internalType = armnn::DataType::QSymmS16;

    const TensorInfo& inputInfo = GetTensorInfo(inputs[0]);
    const TensorInfo& outputStateInInfo = GetTensorInfo(inputs[1]);
    const TensorInfo& cellStateInInfo = GetTensorInfo(inputs[2]);

    const TensorInfo& outputStateOutInfo = GetTensorInfo(outputs[0]);
    const TensorInfo& cellStateOutInfo = GetTensorInfo(outputs[1]);
    const TensorInfo& outputInfo = GetTensorInfo(outputs[2]);

    const TensorShape& inputShape = inputInfo.GetShape();
    const TensorShape& outputStateInShape = outputStateInInfo.GetShape();
    const TensorShape& cellStateInShape = cellStateInInfo.GetShape();

    // Infer numBatches, inputSize, outputSize and numUnits
    const uint32_t numBatches = inputShape[0];
    const uint32_t inputSize = inputShape[1];
    const uint32_t outputSize = outputStateInShape[1];
    const uint32_t numUnits = cellStateInShape[1];
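    // Shapes implied here: input is [numBatches, inputSize], outputStateIn is
    // [numBatches, outputSize] and cellStateIn is [numBatches, numUnits].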

    // Optional param settings
    const bool cifgEnabled = m_Data.m_Parameters.m_CifgEnabled;
    const bool peepholeEnabled = m_Data.m_Parameters.m_PeepholeEnabled;
    const bool projectionEnabled = m_Data.m_Parameters.m_ProjectionEnabled;
    const bool layerNormEnabled = m_Data.m_Parameters.m_LayerNormEnabled;

    // Input decoders
    std::unique_ptr<Decoder<float>> inputDecoder =
            MakeDecoder<float>(inputInfo, inputs[0]->Map());
    std::unique_ptr<Decoder<float>> outputStateInDecoder =
            MakeDecoder<float>(outputStateInInfo, inputs[1]->Map());
    std::unique_ptr<Decoder<float>> cellStateInDecoder =
            MakeDecoder<float>(cellStateInInfo, inputs[2]->Map());

    // Output decoders
    std::unique_ptr<Decoder<float>> outputStateOutDecoder =
            MakeDecoder<float>(outputStateOutInfo, outputs[0]->Map());
    std::unique_ptr<Decoder<float>> cellStateOutDecoder =
            MakeDecoder<float>(cellStateOutInfo, outputs[1]->Map());
    std::unique_ptr<Decoder<float>> outputDecoder =
            MakeDecoder<float>(outputInfo, outputs[2]->Map());

    // Output encoders
    std::unique_ptr<Encoder<float>> outputStateOutEncoder =
            MakeEncoder<float>(outputStateOutInfo, outputs[0]->Map());
    std::unique_ptr<Encoder<float>> cellStateOutEncoder =
            MakeEncoder<float>(cellStateOutInfo, outputs[1]->Map());
    std::unique_ptr<Encoder<float>> outputEncoder =
            MakeEncoder<float>(outputInfo, outputs[2]->Map());
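
    // The output tensors get decoders as well as encoders because they are read back within
    // this invocation: cellStateOut feeds the output-gate peephole and the hidden-state tanh,
    // and output is re-read for the final copy into outputStateOut.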

    // Weights decoders
    std::unique_ptr<Decoder<float>> inputToForgetWeightsDecoder = MakeDecoder<float>(
            m_InputToForgetWeightsTensor->GetTensorInfo(), m_InputToForgetWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> inputToCellWeightsDecoder = MakeDecoder<float>(
            m_InputToCellWeightsTensor->GetTensorInfo(), m_InputToCellWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> inputToOutputWeightsDecoder = MakeDecoder<float>(
            m_InputToOutputWeightsTensor->GetTensorInfo(), m_InputToOutputWeightsTensor->GetConstTensor<void>());

    std::unique_ptr<Decoder<float>> recurrentToForgetWeightsDecoder = MakeDecoder<float>(
            m_RecurrentToForgetWeightsTensor->GetTensorInfo(),
            m_RecurrentToForgetWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> recurrentToCellWeightsDecoder = MakeDecoder<float>(
            m_RecurrentToCellWeightsTensor->GetTensorInfo(), m_RecurrentToCellWeightsTensor->GetConstTensor<void>());
    std::unique_ptr<Decoder<float>> recurrentToOutputWeightsDecoder = MakeDecoder<float>(
            m_RecurrentToOutputWeightsTensor->GetTensorInfo(),
            m_RecurrentToOutputWeightsTensor->GetConstTensor<void>());

    // Optional CIFG params
    std::unique_ptr<Decoder<float>> inputToInputWeightsDecoder;
    std::unique_ptr<Decoder<float>> recurrentToInputWeightsDecoder;
    std::unique_ptr<Decoder<float>> inputGateBiasDecoder;

    // Optional Peephole params
    std::unique_ptr<Decoder<float>> cellToInputWeightsDecoder;
    std::unique_ptr<Decoder<float>> cellToForgetWeightsDecoder;
    std::unique_ptr<Decoder<float>> cellToOutputWeightsDecoder;

    // Optional Projection params
    std::unique_ptr<Decoder<float>> projectionWeightsDecoder;
    std::unique_ptr<Decoder<float>> projectionBiasDecoder;

    // Optional Layer Norm params
    std::unique_ptr<Decoder<float>> inputLayerNormWeightsDecoder;
    std::unique_ptr<Decoder<float>> forgetLayerNormWeightsDecoder;
    std::unique_ptr<Decoder<float>> cellLayerNormWeightsDecoder;
    std::unique_ptr<Decoder<float>> outputLayerNormWeightsDecoder;

    // Biases are only used when Layer Norm is enabled. Scale is defined as (XLayerNormWeights Scale / 1024)
    std::unique_ptr<Decoder<float>> forgetGateBiasDecoder;
    std::unique_ptr<Decoder<float>> cellGateBiasDecoder;
    std::unique_ptr<Decoder<float>> outputGateBiasDecoder;

    // Int16 vectors for internal state data (to be decoded/encoded)
    const uint32_t stateTensorSize = numBatches * numUnits;
    std::vector<int16_t> inputGateData(stateTensorSize);
    std::vector<int16_t> cellGateData(stateTensorSize);
    std::vector<int16_t> forgetGateData(stateTensorSize);
    std::vector<int16_t> outputGateData(stateTensorSize);
    std::vector<int32_t> hiddenStateData(stateTensorSize);
    std::vector<int16_t> outputInt16Data(numBatches * outputSize);

    armnn::TensorInfo inputGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_InputIntermediateScale, 0);
    armnn::TensorInfo cellGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_CellIntermediateScale, 0);
    armnn::TensorInfo forgetGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_ForgetIntermediateScale, 0);
    armnn::TensorInfo outputGateInfo(
            {numBatches, numUnits}, armnn::DataType::QSymmS16, m_Data.m_Parameters.m_OutputIntermediateScale, 0);
    armnn::TensorInfo hiddenStateInfo({numBatches, numUnits},
                                      armnn::DataType::QAsymmS8,
                                      m_Data.m_Parameters.m_HiddenStateScale,
                                      m_Data.m_Parameters.m_HiddenStateZeroPoint);
    armnn::TensorInfo outputInt16Info({numBatches, outputSize},
                                      armnn::DataType::QSymmS16,
                                      outputInfo.GetQuantizationScale(),
                                      outputInfo.GetQuantizationOffset());
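    // The per-gate *IntermediateScale values and the hidden state scale/zero-point come from
    // the QLSTM layer's descriptor parameters; gate scratch data is QSymmS16 while the hidden
    // state uses the QAsymmS8 hidden-state quantization.
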
    // Decoders/Encoders for internal states
    std::unique_ptr<Decoder<float>> inputGateDecoder =
            MakeDecoder<float>(inputGateInfo, inputGateData.data());
    std::unique_ptr<Decoder<float>> cellGateDecoder =
            MakeDecoder<float>(cellGateInfo, cellGateData.data());
    std::unique_ptr<Decoder<float>> forgetGateDecoder =
            MakeDecoder<float>(forgetGateInfo, forgetGateData.data());
    std::unique_ptr<Decoder<float>> outputGateDecoder =
            MakeDecoder<float>(outputGateInfo, outputGateData.data());
    std::unique_ptr<Decoder<float>> hiddenStateDecoder =
            MakeDecoder<float>(hiddenStateInfo, hiddenStateData.data());

    std::unique_ptr<Encoder<float>> inputGateEncoder =
            MakeEncoder<float>(inputGateInfo, inputGateData.data());
    std::unique_ptr<Encoder<float>> cellGateEncoder =
            MakeEncoder<float>(cellGateInfo, cellGateData.data());
    std::unique_ptr<Encoder<float>> forgetGateEncoder =
            MakeEncoder<float>(forgetGateInfo, forgetGateData.data());
    std::unique_ptr<Encoder<float>> outputGateEncoder =
            MakeEncoder<float>(outputGateInfo, outputGateData.data());
    std::unique_ptr<Encoder<float>> hiddenStateEncoder =
            MakeEncoder<float>(hiddenStateInfo, hiddenStateData.data());

    // Int16 used to accumulate output to prevent overflowing (after Projection MatMul)
    std::unique_ptr<Decoder<float>> outputInt16Decoder =
            MakeDecoder<float>(outputInt16Info, outputInt16Data.data());
    std::unique_ptr<Encoder<float>> outputInt16Encoder =
            MakeEncoder<float>(outputInt16Info, outputInt16Data.data());

    // Create decoders for optional params if they are enabled
    if (!cifgEnabled)
    {
        inputToInputWeightsDecoder = MakeDecoder<float>(
                m_InputToInputWeightsTensor->GetTensorInfo(), m_InputToInputWeightsTensor->GetConstTensor<void>());
        recurrentToInputWeightsDecoder = MakeDecoder<float>(m_RecurrentToInputWeightsTensor->GetTensorInfo(),
                                                            m_RecurrentToInputWeightsTensor->GetConstTensor<void>());
    }

    if (peepholeEnabled)
    {
        if (!cifgEnabled)
        {
            cellToInputWeightsDecoder = MakeDecoder<float>(
                    m_CellToInputWeightsTensor->GetTensorInfo(), m_CellToInputWeightsTensor->GetConstTensor<void>());
        }
        cellToForgetWeightsDecoder = MakeDecoder<float>(
                m_CellToForgetWeightsTensor->GetTensorInfo(), m_CellToForgetWeightsTensor->GetConstTensor<void>());
        cellToOutputWeightsDecoder = MakeDecoder<float>(
                m_CellToOutputWeightsTensor->GetTensorInfo(), m_CellToOutputWeightsTensor->GetConstTensor<void>());
    }

    if (projectionEnabled)
    {
        projectionWeightsDecoder = MakeDecoder<float>(
                m_ProjectionWeightsTensor->GetTensorInfo(), m_ProjectionWeightsTensor->GetConstTensor<void>());
        if (m_ProjectionBiasTensor)
        {
            projectionBiasDecoder = MakeDecoder<float>(
                    m_ProjectionBiasTensor->GetTensorInfo(), m_ProjectionBiasTensor->GetConstTensor<void>());
        }
    }
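    // Note that the projection bias is optional even when projection itself is enabled, hence
    // the extra null check on m_ProjectionBiasTensor.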

    if (layerNormEnabled)
    {
        if (!cifgEnabled)
        {
            inputLayerNormWeightsDecoder = MakeDecoder<float>(m_InputLayerNormWeightsTensor->GetTensorInfo(),
                                                              m_InputLayerNormWeightsTensor->GetConstTensor<void>());

            // Bias only used if layer norm enabled
            armnn::TensorInfo inputGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                    m_InputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
            inputGateBiasDecoder = MakeDecoder<float>(
                    inputGateBiasTensorInfo, m_InputGateBiasTensor->GetConstTensor<void>());
        }

        forgetLayerNormWeightsDecoder = MakeDecoder<float>(
                m_ForgetLayerNormWeightsTensor->GetTensorInfo(),
                m_ForgetLayerNormWeightsTensor->GetConstTensor<void>());
        cellLayerNormWeightsDecoder = MakeDecoder<float>(
                m_CellLayerNormWeightsTensor->GetTensorInfo(), m_CellLayerNormWeightsTensor->GetConstTensor<void>());
        outputLayerNormWeightsDecoder = MakeDecoder<float>(
                m_OutputLayerNormWeightsTensor->GetTensorInfo(),
                m_OutputLayerNormWeightsTensor->GetConstTensor<void>());

        // Bias only used if layer norm enabled
        armnn::TensorInfo forgetGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                m_ForgetLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
        forgetGateBiasDecoder = MakeDecoder<float>(
                forgetGateBiasTensorInfo, m_ForgetGateBiasTensor->GetConstTensor<void>());

        armnn::TensorInfo cellGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                m_CellLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
        cellGateBiasDecoder = MakeDecoder<float>(
                cellGateBiasTensorInfo, m_CellBiasTensor->GetConstTensor<void>());

        armnn::TensorInfo outputGateBiasTensorInfo({outputSize}, armnn::DataType::Signed32,
                m_OutputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() / 1024, 0);
        outputGateBiasDecoder = MakeDecoder<float>(
                outputGateBiasTensorInfo, m_OutputGateBiasTensor->GetConstTensor<void>());
    }

    // Initialize internal state tensors with zeroes.
    if (!cifgEnabled)
    {
        ZeroVector(*inputGateEncoder, stateTensorSize);
    }
    ZeroVector(*forgetGateEncoder, stateTensorSize);
    ZeroVector(*cellGateEncoder, stateTensorSize);
    ZeroVector(*outputGateEncoder, stateTensorSize);
    ZeroVector(*hiddenStateEncoder, stateTensorSize);

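    // The gate pre-activations are built up with the accumulate variants below, so the scratch
    // buffers above must start from zero.
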
    // Input weights * Input
    if (!cifgEnabled)
    {
        MatrixBatchVectorMultiplyAccumulate(*inputToInputWeightsDecoder,
                                            numUnits, inputSize, *inputDecoder, numBatches, *inputGateEncoder);
    }

    MatrixBatchVectorMultiplyAccumulate(*inputToForgetWeightsDecoder,
                                        numUnits, inputSize, *inputDecoder, numBatches, *forgetGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*inputToCellWeightsDecoder,
                                        numUnits, inputSize, *inputDecoder, numBatches, *cellGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*inputToOutputWeightsDecoder,
                                        numUnits, inputSize, *inputDecoder, numBatches, *outputGateEncoder);

    // Recurrent weights * OutputStateIn
    if (!cifgEnabled)
    {
        MatrixBatchVectorMultiplyAccumulate(*recurrentToInputWeightsDecoder,
                                            numUnits, outputSize, *outputStateInDecoder, numBatches,
                                            *inputGateEncoder);
    }

    MatrixBatchVectorMultiplyAccumulate(*recurrentToForgetWeightsDecoder,
                                        numUnits, outputSize, *outputStateInDecoder, numBatches, *forgetGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*recurrentToCellWeightsDecoder,
                                        numUnits, outputSize, *outputStateInDecoder, numBatches, *cellGateEncoder);

    MatrixBatchVectorMultiplyAccumulate(*recurrentToOutputWeightsDecoder,
                                        numUnits, outputSize, *outputStateInDecoder, numBatches, *outputGateEncoder);

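    // Each active gate buffer now holds its pre-activation W_x * x_t + W_h * h_{t-1}.
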
    // Input gate.
    if (!cifgEnabled)
    {
        if (peepholeEnabled)
        {
            VectorBatchVectorCwiseProductAccumulate(*cellToInputWeightsDecoder,
                                                    numUnits, *cellStateInDecoder, numBatches, *inputGateEncoder);
        }

        if (layerNormEnabled)
        {
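            // Encoders and decoders are bound to the TensorInfo they were built with, so each
            // time the gate's quantization scale changes they are re-created: the layer norm
            // output is held at inputScale * layerNormWeightScale * 1024, the bias add at
            // 1/4096, and the result is re-quantized to the cell state scale before the sigmoid.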
            inputGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                               m_InputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                               1024);
            inputGateEncoder = MakeEncoder<float>(inputGateInfo, inputGateData.data());

            MeanStddevNormalization(*inputGateDecoder,
                                    *inputGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

            inputGateDecoder = MakeDecoder<float>(inputGateInfo, inputGateData.data());

            VectorBatchVectorCwiseProduct(*inputLayerNormWeightsDecoder,
                                          numUnits, *inputGateDecoder, numBatches, *inputGateEncoder);

            inputGateInfo.SetQuantizationScale(1.f / 4096);
            inputGateEncoder = MakeEncoder<float>(inputGateInfo, inputGateData.data());

            VectorBatchVectorAdd(*inputGateBiasDecoder,
                                 numUnits, *inputGateDecoder, numBatches, *inputGateEncoder);

            inputGateDecoder = MakeDecoder<float>(inputGateInfo, inputGateData.data());
        }

        inputGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
        inputGateEncoder = MakeEncoder<float>(inputGateInfo, inputGateData.data());

        // Input gate sigmoid
        Activation(*inputGateDecoder, *inputGateEncoder,
                   TensorInfo({numUnits, numBatches}, internalType),
                   ActivationFunction::Sigmoid, 0, 0);

        inputGateDecoder = MakeDecoder<float>(inputGateInfo, inputGateData.data());
    }

    // Forget gate
    if (peepholeEnabled)
    {
        VectorBatchVectorCwiseProductAccumulate(*cellToForgetWeightsDecoder, numUnits,
                                                *cellStateInDecoder, numBatches, *forgetGateEncoder);
    }

    if (layerNormEnabled)
    {
        // Quantize layer norm output to input scale * m_ForgetLayerNormWeightsTensor scale * 1024
        forgetGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                            m_ForgetLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                            1024);
        forgetGateEncoder = MakeEncoder<float>(forgetGateInfo, forgetGateData.data());

        MeanStddevNormalization(*forgetGateDecoder,
                                *forgetGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

        forgetGateDecoder = MakeDecoder<float>(forgetGateInfo, forgetGateData.data());

        VectorBatchVectorCwiseProduct(*forgetLayerNormWeightsDecoder,
                                      numUnits, *forgetGateDecoder, numBatches, *forgetGateEncoder);

        // Dequantize layer norm output to (1 / 4096)
        forgetGateInfo.SetQuantizationScale(1.f / 4096);
        forgetGateEncoder = MakeEncoder<float>(forgetGateInfo, forgetGateData.data());

        VectorBatchVectorAdd(*forgetGateBiasDecoder,
                             numUnits, *forgetGateDecoder, numBatches, *forgetGateEncoder);

        forgetGateDecoder = MakeDecoder<float>(forgetGateInfo, forgetGateData.data());
    }

    forgetGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
    forgetGateEncoder = MakeEncoder<float>(forgetGateInfo, forgetGateData.data());

    // Forget gate sigmoid
    Activation(*forgetGateDecoder, *forgetGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::Sigmoid, 0, 0);

    forgetGateDecoder = MakeDecoder<float>(forgetGateInfo, forgetGateData.data());

    // Cell (Modulation) gate
    if (layerNormEnabled)
    {
        cellGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                          m_CellLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                          1024);
        cellGateEncoder = MakeEncoder<float>(cellGateInfo, cellGateData.data());

        MeanStddevNormalization(*cellGateDecoder, *cellGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

        cellGateDecoder = MakeDecoder<float>(cellGateInfo, cellGateData.data());

        VectorBatchVectorCwiseProduct(*cellLayerNormWeightsDecoder,
                                      numUnits, *cellGateDecoder, numBatches, *cellGateEncoder);

        cellGateInfo.SetQuantizationScale(1.f / 4096);
        cellGateEncoder = MakeEncoder<float>(cellGateInfo, cellGateData.data());

        VectorBatchVectorAdd(*cellGateBiasDecoder,
                             numUnits, *cellGateDecoder, numBatches, *cellGateEncoder);

        cellGateDecoder = MakeDecoder<float>(cellGateInfo, cellGateData.data());
    }

    cellGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
    cellGateEncoder = MakeEncoder<float>(cellGateInfo, cellGateData.data());

    // Cell (Modulation) gate tanH
    Activation(*cellGateDecoder, *cellGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::TanH, 1.0f, 1.0f);

    cellGateDecoder = MakeDecoder<float>(cellGateInfo, cellGateData.data());

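    // New cell state: c_t = f_t * c_{t-1}, with i_t * g_t accumulated on top below
    // (under CIFG, Sub1Vector turns the forget gate into i_t = 1 - f_t).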
    VectorVectorCwiseProduct(*forgetGateDecoder, *cellStateInDecoder, stateTensorSize, *cellStateOutEncoder);

    if (cifgEnabled)
    {
        Sub1Vector(*forgetGateDecoder, stateTensorSize, *forgetGateEncoder);
        VectorVectorCwiseProductAccumulate(
                *cellGateDecoder, *forgetGateDecoder, stateTensorSize, *cellStateOutEncoder);
    }
    else
    {
        VectorVectorCwiseProductAccumulate(
                *cellGateDecoder, *inputGateDecoder, stateTensorSize, *cellStateOutEncoder);
    }

    // Final cell state out calculated here
    if (m_Data.m_Parameters.m_CellClip > 0.0)
    {
        ClipVector(*cellStateOutDecoder, stateTensorSize, m_Data.m_Parameters.m_CellClip, *cellStateOutEncoder);
    }

    // Output gate.
    if (peepholeEnabled)
    {
        VectorBatchVectorCwiseProductAccumulate(*cellToOutputWeightsDecoder,
                                                numUnits, *cellStateOutDecoder, numBatches, *outputGateEncoder);
    }

    if (layerNormEnabled)
    {
        outputGateInfo.SetQuantizationScale(inputInfo.GetQuantizationScale() *
                                            m_OutputLayerNormWeightsTensor->GetTensorInfo().GetQuantizationScale() *
                                            1024);
        outputGateEncoder = MakeEncoder<float>(outputGateInfo, outputGateData.data());

        MeanStddevNormalization(*outputGateDecoder, *outputGateEncoder, numUnits, numBatches, m_LayerNormEpsilon);

        outputGateDecoder = MakeDecoder<float>(outputGateInfo, outputGateData.data());

        VectorBatchVectorCwiseProduct(*outputLayerNormWeightsDecoder, numUnits, *outputGateDecoder,
                                      numBatches, *outputGateEncoder);

        outputGateInfo.SetQuantizationScale(1.f / 4096);
        outputGateEncoder = MakeEncoder<float>(outputGateInfo, outputGateData.data());

        VectorBatchVectorAdd(*outputGateBiasDecoder, numUnits, *outputGateDecoder, numBatches, *outputGateEncoder);

        outputGateDecoder = MakeDecoder<float>(outputGateInfo, outputGateData.data());
    }

    outputGateInfo.SetQuantizationScale(cellStateOutInfo.GetQuantizationScale());
    outputGateEncoder = MakeEncoder<float>(outputGateInfo, outputGateData.data());

    // Output gate sigmoid
    Activation(*outputGateDecoder, *outputGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::Sigmoid, 0, 0);

    outputGateDecoder = MakeDecoder<float>(outputGateInfo, outputGateData.data());

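    // Note: tanh(cellStateOut) is written into the cellGate scratch buffer via cellGateEncoder,
    // then multiplied by the output gate to form the hidden state.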
    // Hidden state tanH
    Activation(*cellStateOutDecoder, *cellGateEncoder,
               TensorInfo({numUnits, numBatches}, internalType),
               ActivationFunction::TanH, 1.0f, 1.0f);

    // Final hidden state output
    VectorVectorCwiseProduct(*outputGateDecoder, *cellGateDecoder, stateTensorSize, *hiddenStateEncoder);

    // Projection
    if (m_Data.m_Parameters.m_ProjectionEnabled)
    {
        if (m_ProjectionBiasTensor)
        {
            VectorBatchVectorAssign(*projectionBiasDecoder, outputSize, numBatches, *outputInt16Encoder);
        }

        MatrixBatchVectorMultiplyAccumulate(*projectionWeightsDecoder, outputSize, numUnits, *hiddenStateDecoder,
                                            numBatches, *outputInt16Encoder);

        CopyVector(*outputInt16Decoder, numBatches * outputSize, *outputEncoder);

        if (m_Data.m_Parameters.m_ProjectionClip > 0.0)
        {
            ClipVector(*outputDecoder, numBatches * outputSize, m_Data.m_Parameters.m_ProjectionClip, *outputEncoder);
        }
    }
    else
    {
        // Output has same quantization scale as hidden state if projection is disabled
        CopyVector(*hiddenStateDecoder, numBatches * outputSize, *outputEncoder);
    }

    // output == outputStateOut
    CopyVector(*outputDecoder, numBatches * outputSize, *outputStateOutEncoder);
}

} //namespace armnn