Diffstat (limited to 'src/armnn')
370 files changed, 15696 insertions, 3521 deletions
diff --git a/src/armnn/Descriptors.cpp b/src/armnn/Descriptors.cpp index be04294e85..faf167d95f 100644 --- a/src/armnn/Descriptors.cpp +++ b/src/armnn/Descriptors.cpp @@ -157,7 +157,7 @@ const uint32_t* OriginsDescriptor::GetViewOrigin(uint32_t idx) const } -// Reorder the viewOrigins in accordance with the indices presented in newOrdering array +// Reorders the viewOrigins in accordance with the indices presented in newOrdering array. void OriginsDescriptor::ReorderOrigins(unsigned int* newOrdering, unsigned int numNewOrdering) { BOOST_ASSERT_MSG(m_NumViews == numNewOrdering, "number of views must match number of " diff --git a/src/armnn/DeviceSpec.hpp b/src/armnn/DeviceSpec.hpp new file mode 100644 index 0000000000..3706438482 --- /dev/null +++ b/src/armnn/DeviceSpec.hpp @@ -0,0 +1,22 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "armnn/Types.hpp" +#include <set> + +namespace armnn +{ + +class DeviceSpec : public IDeviceSpec +{ +public: + DeviceSpec() {} + virtual ~DeviceSpec() {} + + std::set<Compute> m_SupportedComputeDevices; +}; + +} diff --git a/src/armnn/Graph.cpp b/src/armnn/Graph.cpp index 87bdc2962f..74b30e4087 100644 --- a/src/armnn/Graph.cpp +++ b/src/armnn/Graph.cpp @@ -32,7 +32,7 @@ Graph::Graph(const Graph& other) otherToClonedMap.emplace(otherLayer, layer); } - // Copy slot connections + // Copies slot connections. for (auto&& otherLayer : other.m_Layers) { Layer* const thisLayer = otherToClonedMap[otherLayer]; @@ -95,18 +95,18 @@ Status Graph::SerializeToDot(std::ostream& stream) .AddAttribute("fontname", "arial-bold"); } - // First declare the nodes + // First declares the nodes. for (auto&& layer : m_Layers) { DotNode node(stream, layer->GetGuid(), GetLayerTypeAsCString(layer->GetType())); - // Extract the layer parameters + // Extracts the layer parameters. ParameterStringifyFunction extractParams = [&node](const std::string & name, const std::string & value){ node.GetContents().AddContent(name + " : " + value); }; layer->SerializeLayerParameters(extractParams); } - // Second declare the edges + // Second declares the edges. for (auto&& layer : m_Layers) { LayerGuid toId = layer->GetGuid(); @@ -117,9 +117,9 @@ Status Graph::SerializeToDot(std::ostream& stream) LayerGuid fromId = outputSlot->GetOwningLayer().GetGuid(); DotEdge edge(stream, fromId, toId); - // Now Print the tensor shape on the edge + // Now print the tensor shape on the edge. { - // Construct the label attribute with HTML markup + // Constructs the label attribute with HTML markup. std::stringstream ss; ss << "< " << outputSlot->GetTensorInfo().GetShape() << " >"; edge.GetAttributeSet().AddAttribute("label", ss); @@ -137,13 +137,94 @@ Status Graph::SerializeToDot(std::ostream& stream) Status Graph::AllocateDynamicBuffers() { + // Layers must be sorted in topological order + BOOST_ASSERT(m_LayersInOrder); + + std::unordered_set<const ITensorHandle*> preallocatedTensors; + std::unordered_map<const ITensorHandle*, unsigned int> handleReferenceCounts; + + // Finds the first TensorHandle ancestor of a SubTensorHandle. 
If the ITensorHandle provided + // is a TensorHandle, the function just returns it + auto TraceSubTensorHandleAncestry = [](ITensorHandle* const subTensorHandle) + { + ITensorHandle* ancestor = subTensorHandle; + while (ancestor && ancestor->GetParent()) + { + ancestor = ancestor->GetParent(); + } + return ancestor; + }; + + // Checks whether a TensorHandle has been pre-allocated + auto IsPreallocated = [&](ITensorHandle* const tensorHandle) + { + return tensorHandle && preallocatedTensors.find(tensorHandle) != preallocatedTensors.end(); + }; + + // Constant tensor handles need to last from the beginning of execution till the end, + // therefore we pre-allocate them upfront for (auto&& layer : m_Layers) { - for (auto slot = layer->BeginOutputSlots(); slot != layer->EndOutputSlots(); ++slot) + if (layer->GetType() == LayerType::Constant) { - slot->GetOutputHandler().AllocateTensors(); + for (auto&& slot = layer->BeginOutputSlots(); slot != layer->EndOutputSlots(); ++slot) + { + ITensorHandle *tensorHandle = TraceSubTensorHandleAncestry(slot->GetOutputHandler().GetData()); + + if (tensorHandle && !IsPreallocated(tensorHandle)) + { + tensorHandle->Allocate(); + preallocatedTensors.insert(tensorHandle); + } + } } } + + // Iterate over the network in topological order + for (auto&& layer : m_Layers) + { + // Count the amount of times each output slot references a certain buffer (ITensorHandle). + // The first time we encounter a new tensor handle, we start managing its lifetime. + for (auto&& slot = layer->BeginOutputSlots(); slot != layer->EndOutputSlots(); ++slot) + { + ITensorHandle *tensorHandle = TraceSubTensorHandleAncestry(slot->GetOutputHandler().GetData()); + + if (tensorHandle && !IsPreallocated(tensorHandle)) + { + unsigned int numConnections = slot->GetNumConnections(); + if (handleReferenceCounts.find(tensorHandle) == handleReferenceCounts.end()) + { + handleReferenceCounts[tensorHandle] = numConnections; + tensorHandle->Manage(); + } + else + { + handleReferenceCounts[tensorHandle] += numConnections; + } + } + } + + // Loop through the input slots in the same layer and decrement the reference counter associated + // to each tensor handle we encounter. Once it reaches zero, we end the lifetime of the tensor handle + for (auto&& slot = layer->BeginInputSlots(); slot != layer->EndInputSlots(); ++slot) + { + ITensorHandle *tensorHandle = TraceSubTensorHandleAncestry( + slot->GetConnectedOutputSlot()->GetOutputHandler().GetData()); + + if (tensorHandle && !IsPreallocated(tensorHandle)) + { + --handleReferenceCounts[tensorHandle]; + + if (handleReferenceCounts[tensorHandle] == 0u) + { + // Stop managing lifetime of tensor handle + tensorHandle->Allocate(); + handleReferenceCounts.erase(tensorHandle); + } + } + } + } + return Status::Success; } @@ -151,7 +232,7 @@ const Graph& Graph::TopologicalSort() const { if (!m_LayersInOrder) { - //Reset layer order + // Resets layer order. for (auto&& it : m_Layers) { it->ResetPriority(); @@ -178,9 +259,9 @@ void Graph::AddCopyLayers() // CPU -> Neon (and viceversa) auto MayNeedCopyLayer = [](const Layer& layer) { - // All layers should have been associated with a valid compute device at this point + // All layers should have been associated with a valid compute device at this point. BOOST_ASSERT(layer.GetComputeDevice() != Compute::Undefined); - // Do not need another copy layer if copy layer is already present + // Does not need another copy layer if a copy layer is already present. 
return layer.GetType() != LayerType::MemCopy; }; @@ -191,14 +272,14 @@ void Graph::AddCopyLayers() unsigned int srcOutputIndex = 0; for (auto&& srcOutput : srcLayer->GetOutputSlots()) { - for (auto&& dstInput : srcOutput.GetConnections()) + std::vector<InputSlot*> connectionCopy = srcOutput.GetConnections(); + for (auto&& dstInput : connectionCopy) { Layer& dstLayer = dstInput->GetOwningLayer(); - if (MayNeedCopyLayer(dstLayer) && (dstLayer.GetComputeDevice() != srcLayer->GetComputeDevice())) { - // A copy layer is needed in between the source and destination layers - // Record the operation rather than attempting to modify the graph as we go + // A copy layer is needed in between the source and destination layers. + // Record the operation rather than attempting to modify the graph as we go. // (invalidating iterators) const std::string copyLayerName = boost::str(boost::format("[ %1% (%2%) -> %3% (%4%) ]") % srcLayer->GetName() diff --git a/src/armnn/Graph.hpp b/src/armnn/Graph.hpp index 06b6fd32ae..fd81e51b7b 100644 --- a/src/armnn/Graph.hpp +++ b/src/armnn/Graph.hpp @@ -5,6 +5,7 @@ #pragma once #include "LayersFwd.hpp" +#include "IGraphObservable.hpp" #include <armnn/Types.hpp> #include <armnn/TensorFwd.hpp> @@ -12,6 +13,7 @@ #include <armnn/Exceptions.hpp> #include <list> +#include <map> #include <unordered_map> #include <unordered_set> #include <vector> @@ -21,6 +23,7 @@ namespace armnn { + class Graph { public: @@ -31,7 +34,7 @@ public: } using LayersList = std::list<Layer*>; - using Iterator = LayersList::const_iterator; // const so pointers in the list can't be modified externally + using Iterator = LayersList::const_iterator; // Const so pointers in the list can't be modified externally. using ConstIterator = boost::transform_iterator<decltype(&PtrCast<const Layer>), Iterator>; using IteratorDifference = Iterator::difference_type; @@ -94,7 +97,7 @@ public: Status SerializeToDot(std::ostream& stream); - /// Adds a new layer of type LaterType to the graph constructed with the arguments passed. + /// Adds a new layer, of type LayerType, to the graph constructed with the arguments passed. template <typename LayerT, typename... Args> LayerT* AddLayer(Args&&... args); @@ -103,6 +106,10 @@ public: template <typename LayerT, typename... Args> LayerT* InsertNewLayer(InputSlot& insertBefore, Args&&... args); + /// Inserts a new layer between insertAfter and the input slot(s) currently connected to it + template <typename LayerT, typename... Args> + LayerT* InsertNewLayer(OutputSlot& insertAfter, Args&&... args); + /// Deletes the layer at the specified position and returns an iterator pointing /// to the next element after the one being deleted. Iterator EraseLayer(Iterator pos); @@ -113,22 +120,22 @@ public: template <typename LayerT> Iterator EraseLayer(LayerT*& layer); - /// Return iterator pointing to begin of list. Lowercase for range-based for loops. + /// Returns iterator pointing to the beginning of the list. Lowercase for range-based for loops. Iterator begin() { return m_Layers.begin(); } - /// Return iterator pointing to end of list. Lowercase for range-based for loops. + /// Returns iterator pointing to the end of the list. Lowercase for range-based for loops. Iterator end() { return m_Layers.end(); } - /// Return const iterator pointing to begin of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the beginning of the list. Lowercase for range-based for loops. 
ConstIterator begin() const { return {m_Layers.begin(), &PtrCast<const Layer>}; } - /// Return const iterator pointing to end of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the end of the list. Lowercase for range-based for loops. ConstIterator end() const { return {m_Layers.end(), &PtrCast<const Layer>}; } - /// Return const iterator pointing to begin of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the beginning of the list. Lowercase for range-based for loops. ConstIterator cbegin() const { return begin(); } - /// Return const iterator pointing to end of list. Lowercase for range-based for loops. + /// Returns const iterator pointing to the end of the list. Lowercase for range-based for loops. ConstIterator cend() const { return end(); } - /// Sort layers in topological order and return this. + /// Sorts layers in topological order and return this. Graph& TopologicalSort() { const_cast<const Graph*>(this)->TopologicalSort(); return *this; } const Graph& TopologicalSort() const; @@ -136,16 +143,16 @@ public: size_t GetNumOutputs() const { return m_OutputIds.size(); } /// Returns a wrapper object with begin(), end() methods to iterate over the input layers - /// in a range-based for loop + /// in a range-based for loop. InputLayersAccessor GetInputLayers() const { return InputLayersAccessor(*this); } /// Returns a wrapper object with begin(), end() methods to iterate over the output layers - /// in a range-based for loop + /// in a range-based for loop. OutputLayersAccessor GetOutputLayers() const { return OutputLayersAccessor(*this); } size_t GetNumLayers() const { return m_Layers.size(); } - /// Allocate memory for all tensors under output tensor handers of each layer + /// Allocates memory for all tensors under output tensor handers of each layer. Status AllocateDynamicBuffers(); /// Modifies the graph in-place, removing edges connecting layers using different compute devices, @@ -154,6 +161,14 @@ public: void InferTensorInfos(); + void AttachObservable(IGraphObservable* const observable, GraphEvent notifyOnEvent) { + m_Views[notifyOnEvent].emplace_back(observable); + } + + void DetachObservable(IGraphObservable* const observable, GraphEvent notifyOnEvent) { + m_Views[notifyOnEvent].remove(observable); + } + private: template <typename LayerT> class LayerInGraphBase; @@ -179,9 +194,18 @@ private: return it; } - /// Get the position of a layer in the graph. + /// Gets the position of a layer in the graph. Iterator GetPosInGraph(Layer& layer); + void NotifyObservables(GraphEvent event, Layer* graphState) + { + // Iterate over all observables observing this event + for (auto& observable : m_Views[event]) + { + observable->Update(graphState); + } + } + std::unordered_set<LayerBindingId> m_InputIds; std::unordered_set<LayerBindingId> m_OutputIds; std::unordered_map<const Layer*, Iterator> m_PosInGraphMap; @@ -189,9 +213,11 @@ private: /// Mutable to allow sorting on const object. mutable LayersList m_Layers; mutable bool m_LayersInOrder; + + std::map<const GraphEvent, std::list<IGraphObservable*>> m_Views; }; -/// Common base class for layers in the graph +/// Common base class for layers in the graph. template <typename LayerT> class Graph::LayerInGraphBase : public LayerT { @@ -212,7 +238,7 @@ protected: Graph& m_Graph; }; -/// Input/Output layers specialize this template +/// Input/Output layers specialize this template. 
template <typename LayerT> class Graph::LayerInGraph final : public LayerInGraphBase<LayerT> { @@ -305,24 +331,51 @@ inline LayerT* Graph::AddLayer(Args&&... args) { m_LayersInOrder = m_LayersInOrder && ((LayerEnumOf<LayerT>() == LayerType::Input) || (LayerEnumOf<LayerT>() == LayerType::Output)); - return new LayerInGraph<LayerT>(*this, std::forward<Args>(args)...); + LayerT* const layer = new LayerInGraph<LayerT>(*this, std::forward<Args>(args)...); + + NotifyObservables(GraphEvent::LayerAdded, layer); + + return layer; } template <typename LayerT, typename... Args> inline LayerT* Graph::InsertNewLayer(InputSlot& insertBefore, Args&&... args) { - // Insert after the parent if any, or before the child otherwise, so topological order is kept. + // Insert after the parent if any, or before the child otherwise, so the topological order is kept. OutputSlot* parentOut = insertBefore.GetConnectedOutputSlot(); const Iterator pos = (parentOut != nullptr) ? std::next(GetPosInGraph(parentOut->GetOwningLayer())) : GetPosInGraph(insertBefore.GetOwningLayer()); LayerT* const layer = new LayerInGraph<LayerT>(*this, pos, std::forward<Args>(args)...); insertBefore.Insert(*layer); + + NotifyObservables(GraphEvent::LayerAdded, layer); + + return layer; +} + +template <typename LayerT, typename... Args> +inline LayerT* Graph::InsertNewLayer(OutputSlot& insertAfter, Args&&... args) +{ + Layer& owningLayer = insertAfter.GetOwningLayer(); + + const Iterator pos = std::next(GetPosInGraph(owningLayer)); + LayerT* const layer = new LayerInGraph<LayerT>(*this, pos, std::forward<Args>(args)...); + + BOOST_ASSERT(layer->GetNumInputSlots() == 1); + + insertAfter.MoveAllConnections(layer->GetOutputSlot()); + insertAfter.Connect(layer->GetInputSlot(0)); + + NotifyObservables(GraphEvent::LayerAdded, layer); + return layer; } inline Graph::Iterator Graph::EraseLayer(Iterator pos) { + NotifyObservables(GraphEvent::LayerErased, *pos); + delete *pos; return m_Layers.erase(pos); } diff --git a/src/armnn/Half.hpp b/src/armnn/Half.hpp new file mode 100644 index 0000000000..4a10c3c8ab --- /dev/null +++ b/src/armnn/Half.hpp @@ -0,0 +1,35 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include <type_traits> +#include <half/half.hpp> + +namespace armnn +{ + using Half = half_float::half; //import half float implementation +} //namespace armnn + + +namespace std +{ + +template<> +struct is_floating_point<armnn::Half> + : integral_constant< bool, true > +{}; + +template<> +struct is_floating_point<const armnn::Half> + : integral_constant< bool, true > +{}; + +template<> +struct is_floating_point<volatile armnn::Half> + : integral_constant< bool, true > +{}; + +} //namespace std
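Aside (not part of the diff): the std::is_floating_point specializations added in Half.hpp let generic ArmNN template code treat armnn::Half like a built-in floating-point type. The sketch below shows the same pattern in a self-contained form; demo::Half is a hypothetical stand-in used only so the example compiles without the third-party half/half.hpp dependency.

#include <iostream>
#include <type_traits>

namespace demo
{
// Hypothetical stand-in for half_float::half, so this sketch needs no external header.
struct Half { unsigned short bits = 0; };
}

namespace std
{
// Same trick as Half.hpp above: teach the standard trait that demo::Half is a floating-point type.
template <>
struct is_floating_point<demo::Half> : integral_constant<bool, true> {};
}

// Generic code can now dispatch on "floating point or not" and include the half type.
template <typename T>
const char* Classify()
{
    return std::is_floating_point<T>::value ? "floating point" : "not floating point";
}

int main()
{
    std::cout << Classify<float>()      << '\n'; // floating point
    std::cout << Classify<demo::Half>() << '\n'; // floating point, thanks to the specialization
    std::cout << Classify<int>()        << '\n'; // not floating point
}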
\ No newline at end of file diff --git a/src/armnn/IGraphObservable.hpp b/src/armnn/IGraphObservable.hpp new file mode 100644 index 0000000000..f1779ec1da --- /dev/null +++ b/src/armnn/IGraphObservable.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "Layer.hpp" + +namespace armnn +{ + +enum class GraphEvent +{ + LayerAdded, + LayerErased +}; + +class IGraphObservable +{ +public: + virtual void Update(Layer* graphLayer) = 0; + +protected: + virtual ~IGraphObservable() = default; +}; + +} //namespace armnn + diff --git a/src/armnn/Instrument.hpp b/src/armnn/Instrument.hpp new file mode 100644 index 0000000000..8d3ac5a76c --- /dev/null +++ b/src/armnn/Instrument.hpp @@ -0,0 +1,66 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include <string> +#include <vector> + +namespace armnn +{ + +struct Measurement +{ + enum Unit + { + TIME_NS, + TIME_US, + TIME_MS, + }; + + inline static const char* ToString(Unit unit) + { + switch (unit) + { + case TIME_NS: return "ns"; + case TIME_US: return "us"; + case TIME_MS: return "ms"; + default: return ""; + } + } + + Measurement(const std::string& name, double value, Unit unit) + : m_Name(name) + , m_Value(value) + , m_Unit(unit) + {} + Measurement(const Measurement&) = default; + ~Measurement() = default; + + std::string m_Name; + double m_Value; + Unit m_Unit; + +private: + // please don't default construct, otherwise Units will be wrong + Measurement() = delete; +}; + +class Instrument +{ +public: + virtual ~Instrument() {} + + virtual void Start() = 0; + + virtual void Stop() = 0; + + virtual std::vector<Measurement> GetMeasurements() const = 0; + + virtual const char* GetName() const = 0; + +}; + +} //namespace armnn diff --git a/src/armnn/InternalTypes.cpp b/src/armnn/InternalTypes.cpp index e39b15be05..3426da3d24 100644 --- a/src/armnn/InternalTypes.cpp +++ b/src/armnn/InternalTypes.cpp @@ -18,6 +18,8 @@ char const* GetLayerTypeAsCString(LayerType type) case LayerType::Addition: return "Addition"; case LayerType::BatchNormalization: return "BatchNormalization"; case LayerType::Constant: return "Constant"; + case LayerType::ConvertFp16ToFp32: return "ConvertFp16ToFp32"; + case LayerType::ConvertFp32ToFp16: return "ConvertFp32ToFp16"; case LayerType::Convolution2d: return "Convolution2d"; case LayerType::DepthwiseConvolution2d: return "DepthwiseConvolution2d"; case LayerType::FakeQuantization: return "FakeQuantization"; @@ -25,6 +27,7 @@ char const* GetLayerTypeAsCString(LayerType type) case LayerType::FullyConnected: return "FullyConnected"; case LayerType::Input: return "Input"; case LayerType::L2Normalization: return "L2Normalization"; + case LayerType::Lstm: return "Lstm"; case LayerType::MemCopy: return "MemCopy"; case LayerType::Merger: return "Merger"; case LayerType::Multiplication: return "Multiplication"; diff --git a/src/armnn/InternalTypes.hpp b/src/armnn/InternalTypes.hpp index 8db0da4cf2..0968e17b18 100644 --- a/src/armnn/InternalTypes.hpp +++ b/src/armnn/InternalTypes.hpp @@ -18,6 +18,8 @@ enum class LayerType Addition, BatchNormalization, Constant, + ConvertFp16ToFp32, + ConvertFp32ToFp16, Convolution2d, DepthwiseConvolution2d, FakeQuantization, @@ -25,6 +27,7 @@ enum class LayerType FullyConnected, Input, L2Normalization, + Lstm, MemCopy, Merger, Multiplication, @@ -35,7 +38,7 @@ enum class LayerType 
Reshape, ResizeBilinear, Softmax, - // Last layer goes here + // Last layer goes here. LastLayer, Splitter = LastLayer, }; diff --git a/src/armnn/JsonPrinter.cpp b/src/armnn/JsonPrinter.cpp new file mode 100644 index 0000000000..f7c1c68758 --- /dev/null +++ b/src/armnn/JsonPrinter.cpp @@ -0,0 +1,134 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "JsonPrinter.hpp" + +#include <iomanip> +#include <iostream> + +namespace armnn +{ + +void JsonPrinter::PrintJsonChildObject(const JsonChildObject& object) +{ + PrintLabel(object.m_Label); + PrintMeasurementsList(object.m_Measurements); + PrintSeparator(); + PrintNewLine(); + PrintUnit(object.m_Unit); + + if (!object.m_Children.empty()) + { + PrintSeparator(); + PrintNewLine(); + for (unsigned int childIndex = 0; childIndex < object.m_Children.size(); ++childIndex) + { + PrintJsonChildObject(object.m_Children[childIndex]); + // Only print separator and new line if current child is not the last element. + if (&object.m_Children[childIndex] != &object.m_Children.back()) + { + PrintSeparator(); + PrintNewLine(); + } + } + } + PrintNewLine(); + PrintFooter(); +} + +void JsonPrinter::PrintHeader() +{ + m_OutputStream << "{" << std::endl; + IncrementNumberOfTabs(); +} + +void JsonPrinter::PrintArmNNHeader() +{ + PrintTabs(); + m_OutputStream << R"("ArmNN": {)" << std::endl; + IncrementNumberOfTabs(); +} + +void JsonPrinter::PrintLabel(const std::string& label) +{ + PrintTabs(); + m_OutputStream << R"(")" << label << R"(": {)" << std::endl; + IncrementNumberOfTabs(); +} + +void JsonPrinter::PrintUnit(armnn::Measurement::Unit unit) +{ + PrintTabs(); + m_OutputStream << R"("unit": ")"; + m_OutputStream << armnn::Measurement::ToString(unit); + m_OutputStream << R"(")"; +} + +void JsonPrinter::PrintMeasurementsList(const std::vector<double>& measurementsVector) +{ + if (measurementsVector.empty()) + { + return; + } + + PrintTabs(); + m_OutputStream << R"("raw": [)" << std::endl; + IncrementNumberOfTabs(); + PrintTabs(); + auto iter = measurementsVector.begin(); + m_OutputStream << *iter; + for (iter = std::next(iter); iter != measurementsVector.end(); ++iter) + { + m_OutputStream << "," << std::endl; + PrintTabs(); + m_OutputStream << *iter; + } + m_OutputStream << std::endl; + DecrementNumberOfTabs(); + PrintTabs(); + m_OutputStream << "]"; +} + +void JsonPrinter::PrintTabs() +{ + unsigned int numTabs = m_NumTabs; + while (numTabs-- > 0) + { + m_OutputStream << "\t"; + } +} + +void JsonPrinter::PrintSeparator() +{ + m_OutputStream << ","; +} + +void JsonPrinter::PrintNewLine() +{ + m_OutputStream << std::endl; +} + +void JsonPrinter::PrintFooter() +{ + DecrementNumberOfTabs(); + PrintTabs(); + m_OutputStream << "}"; +} + +void JsonPrinter::DecrementNumberOfTabs() +{ + if (m_NumTabs == 0) + { + return; + } + --m_NumTabs; +} + +void JsonPrinter::IncrementNumberOfTabs() +{ + ++m_NumTabs; +} + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/JsonPrinter.hpp b/src/armnn/JsonPrinter.hpp new file mode 100644 index 0000000000..1bf9e3175b --- /dev/null +++ b/src/armnn/JsonPrinter.hpp @@ -0,0 +1,82 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include <ostream> +#include <string.h> +#include <map> + +#include "Instrument.hpp" + +namespace armnn +{ + +struct JsonChildObject +{ + JsonChildObject(const std::string& label) + : m_Label(label), m_Unit(Measurement::Unit::TIME_MS) + {} + JsonChildObject(const JsonChildObject&) = default; + + void AddMeasurement(const double measurement) + { + m_Measurements.push_back(measurement); + } + + void AddChild(const JsonChildObject& childObject) + { + m_Children.push_back(childObject); + } + + JsonChildObject GetChild(const unsigned int index) + { + return m_Children[index]; + } + + void SetUnit(const Measurement::Unit unit) + { + m_Unit = unit; + } + + ~JsonChildObject() = default; + + std::string m_Label; + Measurement::Unit m_Unit; + std::vector<double> m_Measurements; + std::vector<JsonChildObject> m_Children; + +private: + JsonChildObject() = delete; +}; + +class JsonPrinter +{ +public: + void PrintJsonChildObject(const JsonChildObject& object); + void PrintHeader(); + void PrintArmNNHeader(); + void PrintFooter(); + void PrintSeparator(); + void PrintNewLine(); + void PrintLabel(const std::string& label); + void PrintUnit(armnn::Measurement::Unit unit); + void PrintMeasurementsList(const std::vector<double>& measurementsVector); + +public: + JsonPrinter(std::ostream &outputStream) + : m_OutputStream(outputStream), m_NumTabs(0) + {} + +private: + void PrintTabs(); + void DecrementNumberOfTabs(); + void IncrementNumberOfTabs(); + + std::ostream &m_OutputStream; + unsigned int m_NumTabs; +}; + +} // namespace armnn
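Aside (not part of the diff): a usage sketch for the JsonPrinter/JsonChildObject API added above. The call sequence is inferred from the public methods shown in this diff rather than taken from the Profiler code, so treat it as illustrative only; the labels and measurement values are made up.

#include <iostream>
#include "JsonPrinter.hpp" // as added by this diff

int main()
{
    using namespace armnn;

    // Build a small tree of measurements: one inference entry with a nested kernel entry.
    JsonChildObject inference("EnqueueWorkload");
    inference.SetUnit(Measurement::Unit::TIME_MS);
    inference.AddMeasurement(12.5);
    inference.AddMeasurement(11.9);

    JsonChildObject kernel("ExampleWorkload"); // hypothetical label
    kernel.SetUnit(Measurement::Unit::TIME_US);
    kernel.AddMeasurement(850.0);
    inference.AddChild(kernel);

    // Print the tree as JSON to stdout.
    JsonPrinter printer(std::cout);
    printer.PrintHeader();                   // opening "{"
    printer.PrintArmNNHeader();              // "ArmNN": {
    printer.PrintJsonChildObject(inference); // label, raw measurements, unit, children
    printer.PrintNewLine();
    printer.PrintFooter();                   // closes "ArmNN"
    printer.PrintNewLine();
    printer.PrintFooter();                   // closes the outer object
    printer.PrintNewLine();
}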
\ No newline at end of file diff --git a/src/armnn/Layer.cpp b/src/armnn/Layer.cpp index fcf0656aeb..9f6d75c46b 100644 --- a/src/armnn/Layer.cpp +++ b/src/armnn/Layer.cpp @@ -10,6 +10,7 @@ #include <boost/cast.hpp> #include <boost/format.hpp> #include <boost/log/trivial.hpp> +#include "backends/CpuTensorHandle.hpp" #include <numeric> @@ -24,19 +25,19 @@ void InputSlot::Insert(Layer& layer) if (prevSlot != nullptr) { - // Disconnect parent from this + // Disconnects parent from this. prevSlot->Disconnect(*this); - // Connect inserted layer to parent + // Connects inserted layer to parent. BOOST_ASSERT(layer.GetNumInputSlots() == 1); prevSlot->Connect(layer.GetInputSlot(0)); - // Set tensor info for inserted layer + // Sets tensor info for inserted layer. const TensorInfo& tensorInfo = prevSlot->GetTensorInfo(); layer.GetOutputHandler().SetTensorInfo(tensorInfo); } - // Connect inserted layer to this + // Connects inserted layer to this. layer.GetOutputSlot(0).Connect(*this); } @@ -117,11 +118,11 @@ void OutputSlot::ValidateConnectionIndex(unsigned int index) const namespace { LayerGuid GenerateLayerGuid() { - //Note: Not thread safe. + // Note: Not thread safe. static LayerGuid newGuid=0; return newGuid++; } -} //namespace +} // namespace Layer::Layer(unsigned int numInputSlots, unsigned int numOutputSlots, LayerType type, const char* name) : m_OutputHandlers(numOutputSlots) @@ -147,7 +148,7 @@ void Layer::CollectWorkloadInputs(WorkloadDataCollector& dataCollector, const Gr { for (auto&& inputSlot : GetInputSlots()) { - // The graph must be well-formed at this point + // The graph must be well-formed at this point. BOOST_ASSERT(inputSlot.GetConnection()); const OutputHandler& outputHandler = inputSlot.GetConnectedOutputSlot()->GetOutputHandler(); dataCollector.Push(outputHandler.GetData(), outputHandler.GetTensorInfo()); @@ -170,13 +171,22 @@ void Layer::CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory) } } +void Layer::ReleaseConstantData() +{ + // Now free up the static data. + OperateOnConstantTensors([](std::unique_ptr<ScopedCpuTensorHandle>& handle) + { + handle.reset(nullptr); + }); +} + DataType Layer::GetDataType() const { - if (GetNumInputSlots() > 0) // Ignore the input layer + if (GetNumInputSlots() > 0) // Ignore the input layer. { return GetInputSlot(0).GetConnection()->GetTensorInfo().GetDataType(); } - return DataType::Float32; + return GetOutputSlot(0).GetTensorInfo().GetDataType(); } void Layer::ResetPriority() const @@ -226,4 +236,64 @@ LayerPriority Layer::GetPriority() const return m_Priority; } +void Layer::VerifyLayerConnections(unsigned int expectedConnections, const CheckLocation& location) const +{ + BOOST_ASSERT(GetNumInputSlots() == expectedConnections); + + for (unsigned int i=0; i<expectedConnections; ++i) + { + if (GetInputSlot(i).GetConnection() == nullptr) + { + throw LayerValidationException( + boost::str( + boost::format( + "Input connection #%1% must be connected " + "for %2% layer %3% %4%") + % i + % GetLayerTypeAsCString(this->GetType()) + % GetNameStr() + % location.AsString())); + } + if(! 
GetInputSlot(i).GetConnection()->IsTensorInfoSet()) + { + throw LayerValidationException( + boost::str( + boost::format( + "TensorInfo of Input connection #%1% must be set on connected OutputSlot for " + "%2% layer %3% %4%") + % i + % GetLayerTypeAsCString(this->GetType()) + % GetNameStr() + % location.AsString())); + } + } +} + +std::vector<TensorShape> Layer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const +{ + BOOST_ASSERT(GetNumInputSlots() != 0); + BOOST_ASSERT(GetNumOutputSlots() != 0); + + // By default we return what we got, meaning the output shape(s) are the same as the input(s). + // This only works if the number of inputs and outputs are the same. Since we are in the Layer + // base class, this means the implementation needs to be overridden in the specific layers for + // the other cases. So the missing implementation justifies the UnimplementedException. + + if (GetNumInputSlots() != GetNumOutputSlots()) + { + throw UnimplementedException( + boost::str( + boost::format( + "Default implementation for InferOutputShapes can only be used for " + "layers with the same number of input and output slots. This doesn't " + "hold for %1% layer %2% (#inputs=%3% #outputs=%4%) %5%") + % GetLayerTypeAsCString(this->GetType()) + % GetNameStr() + % GetNumInputSlots() + % GetNumOutputSlots() + % CHECK_LOCATION().AsString())); + } + return inputShapes; +} + } // namespace armnn diff --git a/src/armnn/Layer.hpp b/src/armnn/Layer.hpp index 2a199afc24..ebd6b251b4 100644 --- a/src/armnn/Layer.hpp +++ b/src/armnn/Layer.hpp @@ -21,6 +21,8 @@ #include <string> #include <vector> #include <iostream> +#include <functional> +#include <list> #include <boost/numeric/conversion/cast.hpp> #include <boost/core/ignore_unused.hpp> @@ -51,7 +53,7 @@ public: const OutputSlot* GetConnectedOutputSlot() const { return m_Connection; } OutputSlot* GetConnectedOutputSlot() { return m_Connection; } - /// Links the slot to an output slot or breaks an existing link if passing nullptr + /// Links the slot to an output slot or breaks an existing link if passing nullptr. void SetConnection(OutputSlot* source) { if (m_Connection != nullptr && source != nullptr) @@ -62,7 +64,7 @@ public: m_Connection = source; } - // Insert single-output existing layer at this point in the graph. + // Inserts single-output existing layer at this point in the graph. void Insert(Layer& layer); // IInputSlot @@ -113,10 +115,10 @@ public: bool ValidateTensorShape(const TensorShape& shape) const; - // Disconnect all conections + // Disconnect all conections. void DisconnectAll(); - /// Move all connections to another OutputSlot + /// Moves all connections to another OutputSlot. void MoveAllConnections(OutputSlot& destination); // IOutputSlot @@ -147,7 +149,7 @@ private: std::vector<InputSlot*> m_Connections; }; -// InputSlot inlines that need OutputSlot declaration +// InputSlot inlines that need OutputSlot declaration. inline InputSlot::~InputSlot() { @@ -172,6 +174,9 @@ inline InputSlot::~InputSlot() inline const IOutputSlot* InputSlot::GetConnection() const { return GetConnectedOutputSlot(); } inline IOutputSlot* InputSlot::GetConnection() { return GetConnectedOutputSlot(); } + +class ScopedCpuTensorHandle; + // Base layer class using LayerPriority = unsigned int; @@ -179,7 +184,7 @@ using LayerPriority = unsigned int; class Layer : public IConnectableLayer { public: - /// @param name Optional name for the layer (may be nullptr) + /// @param name - Optional name for the layer (may be nullptr). 
Layer(unsigned int numInputSlots, unsigned int numOutputSlots, LayerType type, const char* name); const std::string& GetNameStr() const @@ -200,15 +205,15 @@ public: const std::vector<InputSlot>& GetInputSlots() const { return m_InputSlots; } const std::vector<OutputSlot>& GetOutputSlots() const { return m_OutputSlots; } - // Allow non-const access to input slots, but don't expose vector (vector size is fixed at layer construction). + // Allows non-const access to input slots, but don't expose vector (vector size is fixed at layer construction). std::vector<InputSlot>::iterator BeginInputSlots() { return m_InputSlots.begin(); } std::vector<InputSlot>::iterator EndInputSlots() { return m_InputSlots.end(); } - // Allow non-const access to output slots, but don't expose vector (vector size is fixed at layer construction). + // Allows non-const access to output slots, but don't expose vector (vector size is fixed at layer construction). std::vector<OutputSlot>::iterator BeginOutputSlots() { return m_OutputSlots.begin(); } std::vector<OutputSlot>::iterator EndOutputSlots() { return m_OutputSlots.end(); } - // Check whether the outputs of this layer don't have any connection + // Checks whether the outputs of this layer don't have any connection. bool IsOutputUnconnected() { unsigned int numConnections = 0; @@ -221,7 +226,7 @@ public: return (GetNumOutputSlots() > 0) && (numConnections == 0); } - // Used for sorting + // Used for sorting. void ResetPriority() const; LayerPriority GetPriority() const; @@ -238,16 +243,35 @@ public: virtual void CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory); - /// Creates a dynamically-allocated copy of this layer - /// @param graph The Graph into which this Layer is being cloned + /// Creates a dynamically-allocated copy of this layer. + /// @param graph - The Graph into which this Layer is being cloned. virtual Layer* Clone(Graph& graph) const = 0; + void VerifyLayerConnections(unsigned int expectedConnections, const CheckLocation& location) const; + virtual void ValidateTensorShapesFromInputs() = 0; - /// Helper to serialize the layer parameters to string - /// (currently used in DotSerializer and company) + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; + + /// Helper to serialize the layer parameters to string. + /// (currently used in DotSerializer and company). virtual void SerializeLayerParameters(ParameterStringifyFunction &) const {} + // Free up the constant source data + virtual void ReleaseConstantData(); + + template<typename Op> + void OperateOnConstantTensors(Op op) + { + for (auto constant : GetConstantTensorsByRef()) + { + if (constant.get()) + { + op(constant); + } + } + }; + // IConnectableLayer const char* GetName() const override { return m_LayerName.c_str(); } @@ -263,8 +287,12 @@ public: void SetGuid(LayerGuid guid) { m_Guid = guid; } LayerGuid GetGuid() const final { return m_Guid; } + void AddRelatedLayerName(const std::string layerName) { m_RelatedLayerNames.emplace_back(layerName); } + + const std::list<std::string>& GetRelatedLayerNames() { return m_RelatedLayerNames; } + protected: - // Graph needs access to the virtual destructor + // Graph needs access to the virtual destructor. friend class Graph; virtual ~Layer() = default; @@ -282,7 +310,7 @@ protected: CollectWorkloadOutputs(dataCollector, graph); } - /// Helper function to reduce duplication in *Layer::CreateWorkload + /// Helper function to reduce duplication in *Layer::CreateWorkload. 
template <typename QueueDescriptor> WorkloadInfo PrepInfoAndDesc(QueueDescriptor& descriptor, const Graph& graph) const { @@ -295,6 +323,10 @@ protected: template <typename LayerType, typename ... Params> LayerType* CloneBase(Graph& graph, Params&& ... params) const; + // Retrieve the Handles to the constants + using ConstantTensors = std::vector<std::reference_wrapper<std::unique_ptr<ScopedCpuTensorHandle>>>; + virtual ConstantTensors GetConstantTensorsByRef() {return ConstantTensors(); }; + private: void CollectWorkloadInputs(WorkloadDataCollector& dataCollector, const Graph& graph) const; void CollectWorkloadOutputs(WorkloadDataCollector& dataCollector, const Graph& graph) const; @@ -311,14 +343,16 @@ private: const LayerType m_Type; Compute m_ComputeDevice; - /// Used for sorting + /// Used for sorting. mutable LayerPriority m_Priority = 0; mutable bool m_Visiting = false; LayerGuid m_Guid; + + std::list<std::string> m_RelatedLayerNames; }; -// A layer user-provided data can be bound to (e.g. inputs, outputs) +// A layer user-provided data can be bound to (e.g. inputs, outputs). class BindableLayer : public Layer { public: diff --git a/src/armnn/LayerSupport.cpp b/src/armnn/LayerSupport.cpp index a0f6276e2b..a734e03a56 100644 --- a/src/armnn/LayerSupport.cpp +++ b/src/armnn/LayerSupport.cpp @@ -16,20 +16,20 @@ namespace armnn { -// Helper function to copy a full string to a truncated version +/// Helper function to copy a full string to a truncated version. void CopyErrorMessage(char* truncatedString, const char* fullString, size_t maxLength) { if(truncatedString != nullptr) { size_t copyLength = std::min(maxLength, strlen(fullString)); std::strncpy(truncatedString, fullString, copyLength); - // Ensure null-terminated string + // Ensure null-terminated string. truncatedString[copyLength] = '\0'; } } // Helper macro to avoid code duplication. -// Forwards function func to funcRef, funcNeon or funcCl, depending on the value of compute +// Forwards function func to funcRef, funcNeon or funcCl, depending on the value of compute. #define FORWARD_LAYER_SUPPORT_FUNC(compute, func, ...) 
\ std::string reasonIfUnsupportedFull; \ bool isSupported; \ @@ -58,11 +58,12 @@ bool CheckTensorDataTypesEqual(const TensorInfo& input0, const TensorInfo& input bool IsActivationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsActivationSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsActivationSupported, input, output, descriptor); } bool IsAdditionSupported(Compute compute, @@ -82,11 +83,24 @@ bool IsAdditionSupported(Compute compute, bool IsBatchNormalizationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsBatchNormalizationSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, + IsBatchNormalizationSupported, + input, + output, + mean, + var, + beta, + gamma, + descriptor); } bool IsConstantSupported(Compute compute, @@ -97,6 +111,24 @@ bool IsConstantSupported(Compute compute, FORWARD_LAYER_SUPPORT_FUNC(compute, IsConstantSupported, output); } +bool IsConvertFp16ToFp32Supported(Compute compute, + const TensorInfo& input, + const TensorInfo& output, + char* reasonIfUnsupported, + size_t reasonIfUnsupportedMaxLength) +{ + FORWARD_LAYER_SUPPORT_FUNC(compute, IsConvertFp16ToFp32Supported, input, output); +} + +bool IsConvertFp32ToFp16Supported(Compute compute, + const TensorInfo& input, + const TensorInfo& output, + char* reasonIfUnsupported, + size_t reasonIfUnsupportedMaxLength) +{ + FORWARD_LAYER_SUPPORT_FUNC(compute, IsConvertFp32ToFp16Supported, input, output); +} + bool IsConvolution2dSupported(Compute compute, const TensorInfo& input, const TensorInfo& output, @@ -111,12 +143,14 @@ bool IsConvolution2dSupported(Compute compute, bool IsDepthwiseConvolutionSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsDepthwiseConvolutionSupported, input, descriptor, weights); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsDepthwiseConvolutionSupported, input, output, descriptor, weights, biases); } bool IsInputSupported(Compute compute, @@ -129,21 +163,51 @@ bool IsInputSupported(Compute compute, bool IsFullyConnectedSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsFullyConnectedSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsFullyConnectedSupported, input, output, weights, biases, descriptor); } bool IsL2NormalizationSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsL2NormalizationSupported, input); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsL2NormalizationSupported, input, output); } +bool IsLstmSupported(Compute compute, const TensorInfo& input, const TensorInfo& outputStateIn, + 
const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, char* reasonIfUnsupported, + size_t reasonIfUnsupportedMaxLength) + +{ + FORWARD_LAYER_SUPPORT_FUNC(compute, IsLstmSupported, input, outputStateIn, cellStateIn, + scratchBuffer, outputStateOut, cellStateOut, + output, descriptor, inputToForgetWeights, inputToCellWeights, + inputToOutputWeights, recurrentToForgetWeights, + recurrentToCellWeights, recurrentToOutputWeights, + forgetGateBias, cellBias, outputGateBias, + inputToInputWeights, recurrentToInputWeights, + cellToInputWeights, inputGateBias, projectionWeights, + projectionBias, cellToForgetWeights, cellToOutputWeights); +} bool IsMergerSupported(Compute compute, std::vector<const TensorInfo*> inputs, const OriginsDescriptor& descriptor, @@ -157,10 +221,11 @@ bool IsMergerSupported(Compute compute, bool IsMultiplicationSupported(Compute compute, const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsMultiplicationSupported, input0, input1); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsMultiplicationSupported, input0, input1, output); } bool IsNormalizationSupported(Compute compute, @@ -211,11 +276,12 @@ bool IsResizeBilinearSupported(Compute compute, bool IsSoftmaxSupported(Compute compute, const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - FORWARD_LAYER_SUPPORT_FUNC(compute, IsSoftmaxSupported, input, descriptor); + FORWARD_LAYER_SUPPORT_FUNC(compute, IsSoftmaxSupported, input, output, descriptor); } bool IsSplitterSupported(Compute compute, @@ -250,7 +316,7 @@ bool IsFloorSupported(Compute compute, char* reasonIfUnsupported, size_t reasonIfUnsupportedMaxLength) { - // By definition (that is, regardless of compute device), shapes and data type must match + // By definition (that is, regardless of compute device), shapes and data type must match. if (input.GetShape() != output.GetShape() || input.GetDataType() != output.GetDataType()) { return false; diff --git a/src/armnn/LayerSupportCommon.hpp b/src/armnn/LayerSupportCommon.hpp index 5b7feac387..63065c0565 100644 --- a/src/armnn/LayerSupportCommon.hpp +++ b/src/armnn/LayerSupportCommon.hpp @@ -11,17 +11,20 @@ namespace armnn { -template<typename Float32Func, typename Uint8Func, typename ... Params> +template<typename Float16Func, typename Float32Func, typename Uint8Func, typename ... Params> bool IsSupportedForDataTypeGeneric(std::string* reasonIfUnsupported, DataType dataType, - Float32Func floatFuncPtr, + Float16Func float16FuncPtr, + Float32Func float32FuncPtr, Uint8Func uint8FuncPtr, Params&&... 
params) { switch(dataType) { + case DataType::Float16: + return float16FuncPtr(reasonIfUnsupported, std::forward<Params>(params)...); case DataType::Float32: - return floatFuncPtr(reasonIfUnsupported, std::forward<Params>(params)...); + return float32FuncPtr(reasonIfUnsupported, std::forward<Params>(params)...); case DataType::QuantisedAsymm8: return uint8FuncPtr(reasonIfUnsupported, std::forward<Params>(params)...); default: @@ -42,6 +45,16 @@ bool FalseFunc(std::string* reasonIfUnsupported, Params&&... params) } template<typename ... Params> +bool FalseFuncF16(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float16 data type"; + } + return false; +} + +template<typename ... Params> bool FalseFuncF32(std::string* reasonIfUnsupported, Params&&... params) { if (reasonIfUnsupported) @@ -61,4 +74,44 @@ bool FalseFuncU8(std::string* reasonIfUnsupported, Params&&... params) return false; } +template<typename ... Params> +bool FalseInputFuncF32(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float32 data type input"; + } + return false; +} + +template<typename ... Params> +bool FalseInputFuncF16(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float16 data type input"; + } + return false; +} + +template<typename ... Params> +bool FalseOutputFuncF32(std::string* reasonIfUnsupported, Params&&... params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float32 data type output"; + } + return false; +} + +template<typename ... Params> +bool FalseOutputFuncF16(std::string* reasonIfUnsupported, Params&&... 
params) +{ + if (reasonIfUnsupported) + { + *reasonIfUnsupported = "Layer is not supported with float16 data type output"; + } + return false; +} + } diff --git a/src/armnn/LayersFwd.hpp b/src/armnn/LayersFwd.hpp index 64d5dcea9b..e79149f28f 100644 --- a/src/armnn/LayersFwd.hpp +++ b/src/armnn/LayersFwd.hpp @@ -10,6 +10,8 @@ #include "layers/AdditionLayer.hpp" #include "layers/BatchNormalizationLayer.hpp" #include "layers/ConstantLayer.hpp" +#include "layers/ConvertFp16ToFp32Layer.hpp" +#include "layers/ConvertFp32ToFp16Layer.hpp" #include "layers/Convolution2dLayer.hpp" #include "layers/DepthwiseConvolution2dLayer.hpp" #include "layers/FakeQuantizationLayer.hpp" @@ -17,6 +19,7 @@ #include "layers/FullyConnectedLayer.hpp" #include "layers/InputLayer.hpp" #include "layers/L2NormalizationLayer.hpp" +#include "layers/LstmLayer.hpp" #include "layers/MemCopyLayer.hpp" #include "layers/MergerLayer.hpp" #include "layers/MultiplicationLayer.hpp" @@ -60,6 +63,8 @@ DECLARE_LAYER(Activation) DECLARE_LAYER(Addition) DECLARE_LAYER(BatchNormalization) DECLARE_LAYER(Constant) +DECLARE_LAYER(ConvertFp16ToFp32) +DECLARE_LAYER(ConvertFp32ToFp16) DECLARE_LAYER(Convolution2d) DECLARE_LAYER(DepthwiseConvolution2d) DECLARE_LAYER(FakeQuantization) @@ -67,6 +72,7 @@ DECLARE_LAYER(Floor) DECLARE_LAYER(FullyConnected) DECLARE_LAYER(Input) DECLARE_LAYER(L2Normalization) +DECLARE_LAYER(Lstm) DECLARE_LAYER(MemCopy) DECLARE_LAYER(Merger) DECLARE_LAYER(Multiplication) diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp index 3c73d4ccfe..e1f8de3d88 100644 --- a/src/armnn/LoadedNetwork.cpp +++ b/src/armnn/LoadedNetwork.cpp @@ -27,30 +27,54 @@ namespace armnn using namespace std; +namespace +{ + +template <typename ExceptionType> +std::string ToErrorMessage(const char * prefix, const ExceptionType & error) +{ + std::stringstream ss; + ss << prefix << " " << error.what(); + return ss.str(); +} + +#if ARMCOMPUTECL_ENABLED +std::string ToErrorMessage(const char * prefix, const cl::Error& error) +{ + std::stringstream ss; + ss << prefix << " " << error.what() << ". CL error code is: " << error.err(); + return ss.str(); +} +#endif + +} // anonymous + std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr<OptimizedNetwork> net, - bool useCpuRefAsFallback) + std::string & errorMessage) { std::unique_ptr<LoadedNetwork> loadedNetwork; try { - loadedNetwork.reset(new LoadedNetwork(std::move(net), useCpuRefAsFallback)); + loadedNetwork.reset(new LoadedNetwork(std::move(net))); } catch (const std::runtime_error& error) { - BOOST_LOG_TRIVIAL(error) << "An error occurred when preparing the network workloads: " << error.what(); + errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error); + BOOST_LOG_TRIVIAL(error) << errorMessage; return std::unique_ptr<LoadedNetwork>(); } catch (const armnn::Exception& error) { - BOOST_LOG_TRIVIAL(error) << "An error occurred when preparing the network workloads: " << error.what(); + errorMessage = ToErrorMessage("An error occurred when preparing the network workloads: ", error); + BOOST_LOG_TRIVIAL(error) << errorMessage; return std::unique_ptr<LoadedNetwork>(); } #if ARMCOMPUTECL_ENABLED catch (const cl::Error& error) { - BOOST_LOG_TRIVIAL(error) << "A CL error occurred attempting to prepare a network workload: " - << error.what() << ". 
CL error code is: " << error.err(); + errorMessage = ToErrorMessage("A CL error occurred attempting to prepare a network workload: ", error); + BOOST_LOG_TRIVIAL(error) << errorMessage; return std::unique_ptr<LoadedNetwork>(); } #endif @@ -58,21 +82,25 @@ std::unique_ptr<LoadedNetwork> LoadedNetwork::MakeLoadedNetwork(std::unique_ptr< return loadedNetwork; } -LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net, bool useCpuRefAsFallback) - : m_CpuRef(useCpuRefAsFallback) +LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net) + : m_CpuRef() , m_OptimizedNetwork(std::move(net)) { + // Create a profiler and register it for the current thread. + m_Profiler = std::make_shared<Profiler>(); + ProfilerManager::GetInstance().RegisterProfiler(m_Profiler.get()); + Graph& order = m_OptimizedNetwork->GetGraph().TopologicalSort(); - //first create tensor handlers - //handlers are created before workloads are - //because workload creation can modify some of the handlers - //(for example the splitter and merger layers) + //First create tensor handlers. + //Handlers are created before workloads are. + //Because workload creation can modify some of the handlers, + //(for example the splitter and merger layers). for (auto&& layer : order) { layer->CreateTensorHandles(m_OptimizedNetwork->GetGraph(), GetWorkloadFactory(*layer)); } - //then create workloads + //Then create workloads. for (auto&& layer : order) { const IWorkloadFactory& workloadFactory = GetWorkloadFactory(*layer); @@ -82,7 +110,7 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net, bool useCpuR case LayerType::Input: case LayerType::Output: { - // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput() + // Inputs and outputs are treated in a special way - see EnqueueInput() and EnqueueOutput(). break; } default: @@ -99,15 +127,17 @@ LoadedNetwork::LoadedNetwork(std::unique_ptr<OptimizedNetwork> net, bool useCpuR } m_WorkloadQueue.push_back(move(workload)); + // release the constant data in the layer.. + layer->ReleaseConstantData(); break; } } } - // set up memory + // Set up memory. m_OptimizedNetwork->GetGraph().AllocateDynamicBuffers(); - // finalize the workload factories before execution + // Finalize the workload factories before execution. m_CpuRef.Finalize(); m_CpuAcc.Finalize(); m_GpuAcc.Finalize(); @@ -159,17 +189,20 @@ const IWorkloadFactory& LoadedNetwork::GetWorkloadFactory(const Layer& layer) co break; } case Compute::CpuRef: - default: { workloadFactory = &m_CpuRef; break; } + default: + { + break; + } } BOOST_ASSERT_MSG(workloadFactory, "No workload factory"); std::string reasonIfUnsupported; - BOOST_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, layer.GetDataType(), reasonIfUnsupported), + BOOST_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(layer, {}, reasonIfUnsupported), "Factory does not support layer"); boost::ignore_unused(reasonIfUnsupported); @@ -273,19 +306,18 @@ private: Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors) { - ARMNN_UPDATE_PROFILING_EVENT_TAG(); ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "EnqueueWorkload"); const Graph& graph = m_OptimizedNetwork->GetGraph(); - // Walk graph to determine the order of execution + // Walk graph to determine the order of execution. 
if (graph.GetNumLayers() < 2) { BOOST_LOG_TRIVIAL(warning) << "IRuntime::EnqueueWorkload()::Less than two nodes in graph"; return Status::Failure; } - // Data that must be kept alive for the entire execution of the workload + // Data that must be kept alive for the entire execution of the workload. WorkloadData workloadData(inputTensors, outputTensors); if (graph.GetNumInputs() != inputTensors.size()) @@ -293,14 +325,14 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, throw InvalidArgumentException("Number of inputs provided does not match network."); } - // for each input to the network, call EnqueueInput with the data passed by the user + // For each input to the network, call EnqueueInput with the data passed by the user. for (const BindableLayer* inputLayer : graph.GetInputLayers()) { const TensorPin& pin = workloadData.GetInputTensorPin(inputLayer->GetBindingId()); EnqueueInput(*inputLayer, pin.GetTensorHandle(), pin.GetTensorInfo()); } - // for each output to the network, call EnqueueOutput with the data passed by the user + // For each output to the network, call EnqueueOutput with the data passed by the user. for (const BindableLayer* outputLayer : graph.GetOutputLayers()) { const TensorPin& pin = workloadData.GetOutputTensorPin(outputLayer->GetBindingId()); @@ -315,7 +347,7 @@ Status LoadedNetwork::EnqueueWorkload(const InputTensors& inputTensors, executionSucceeded = Execute(); } - // Hack: get rid of inputs and outputs we added + // Hack: get rid of inputs and outputs we added. TidyWorkloadQueue(graph.GetNumInputs(), graph.GetNumOutputs()); return executionSucceeded ? Status::Success : Status::Failure; @@ -374,7 +406,7 @@ void LoadedNetwork::EnqueueOutput(const BindableLayer& layer, ITensorHandle* ten BOOST_ASSERT_MSG(layer.GetNumInputSlots() == 1, "Output Layer should have exactly one input."); - // Get the output handler from the previous node + // Gets the output handler from the previous node. 
const OutputHandler& outputHandler = layer.GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler(); const TensorInfo& inputTensorInfo = outputHandler.GetTensorInfo(); @@ -394,6 +426,10 @@ bool LoadedNetwork::Execute() { bool success = true; + m_CpuRef.Acquire(); + m_CpuAcc.Acquire(); + m_GpuAcc.Acquire(); + try { for (size_t i = 0; i < m_WorkloadQueue.size(); ++i) @@ -415,6 +451,11 @@ bool LoadedNetwork::Execute() success = false; } + // Informs the memory managers to release memory in it's respective memory group + m_CpuRef.Release(); + m_CpuAcc.Release(); + m_GpuAcc.Release(); + return success; } diff --git a/src/armnn/LoadedNetwork.hpp b/src/armnn/LoadedNetwork.hpp index 79a0b267e9..286f804234 100644 --- a/src/armnn/LoadedNetwork.hpp +++ b/src/armnn/LoadedNetwork.hpp @@ -8,6 +8,7 @@ #include "armnn/Types.hpp" #include "Network.hpp" #include "LayerFwd.hpp" +#include "Profiling.hpp" #include "backends/RefWorkloadFactory.hpp" #include "backends/NeonWorkloadFactory.hpp" #include "backends/ClWorkloadFactory.hpp" @@ -33,10 +34,15 @@ public: Status EnqueueWorkload(const InputTensors& inputTensors, const OutputTensors& outputTensors); static std::unique_ptr<LoadedNetwork> MakeLoadedNetwork(std::unique_ptr<OptimizedNetwork> net, - bool useCpuRefAsFallback); + std::string & errorMessage); + + // NOTE we return by reference as the purpose of this method is only to provide + // access to the private m_Profiler and in theory we should not need to increment + // the shared_ptr's reference counter + const std::shared_ptr<Profiler>& GetProfiler() const { return m_Profiler; } private: - LoadedNetwork(std::unique_ptr<OptimizedNetwork> net, bool useCpuRefAsFallback); + LoadedNetwork(std::unique_ptr<OptimizedNetwork> net); void EnqueueInput(const BindableLayer& layer, ITensorHandle* tensorHandle, const TensorInfo& tensorInfo); @@ -54,6 +60,7 @@ private: std::unique_ptr<OptimizedNetwork> m_OptimizedNetwork; std::vector< std::unique_ptr<IWorkload> > m_WorkloadQueue; + std::shared_ptr<Profiler> m_Profiler; }; } diff --git a/src/armnn/NeonInterceptorScheduler.cpp b/src/armnn/NeonInterceptorScheduler.cpp new file mode 100644 index 0000000000..fc95ef439e --- /dev/null +++ b/src/armnn/NeonInterceptorScheduler.cpp @@ -0,0 +1,57 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "NeonInterceptorScheduler.hpp" + +#include <boost/assert.hpp> + +namespace armnn{ + +NeonInterceptorScheduler::NeonInterceptorScheduler(NeonTimer::KernelMeasurements& kernels, + arm_compute::IScheduler &realScheduler) + : m_Kernels(kernels), m_RealScheduler(realScheduler) +{ +} + +void NeonInterceptorScheduler::set_num_threads(unsigned int numThreads) +{ + m_RealScheduler.set_num_threads(numThreads); +} + +unsigned int NeonInterceptorScheduler::num_threads() const +{ + return m_RealScheduler.num_threads(); +} + +void NeonInterceptorScheduler::schedule(arm_compute::ICPPKernel* kernel, const Hints& hints) +{ + m_Timer.Start(); + m_RealScheduler.schedule(kernel, hints.split_dimension()); + m_Timer.Stop(); + + std::vector<Measurement> measurements = m_Timer.GetMeasurements(); + BOOST_ASSERT(!measurements.empty()); + + Measurement measurement(measurements.front()); // NOTE: 1st measurement is delta + measurement.m_Name = kernel->name(); + m_Kernels.push_back(std::move(measurement)); +} + +void NeonInterceptorScheduler::run_workloads(std::vector <Workload>& workloads) +{ + m_Timer.Start(); + m_RealScheduler.run_workloads(workloads); + m_Timer.Stop(); + + std::vector<Measurement> measurements = m_Timer.GetMeasurements(); + BOOST_ASSERT_MSG(measurements.size() == 3, "WallClockTimer does not have correct amount of measurements."); + + // WallClockTimer has 3 measurements, duration always being the first. + Measurement measurement(measurements.front()); + measurement.m_Name = "Workload"; + m_Kernels.push_back(std::move(measurement)); +} + +} // namespace armnn
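The interceptor above is a thin decorator over the real Arm Compute scheduler: each schedule() or run_workloads() call is forwarded unchanged while a wall-clock timer brackets it, and the resulting duration is stored under the kernel's name. Roughly the same pattern in a self-contained form (FakeScheduler and TimingInterceptor are illustrative stand-ins, not Arm NN or ACL types):

#include <chrono>
#include <functional>
#include <iostream>
#include <string>
#include <vector>

struct Measurement { std::string m_Name; double m_Ms; };

struct FakeScheduler
{
    // Stand-in for the real scheduler: just runs the work it is given.
    void Schedule(const std::function<void()>& kernel) { kernel(); }
};

class TimingInterceptor
{
public:
    TimingInterceptor(FakeScheduler& real, std::vector<Measurement>& out)
        : m_Real(real), m_Out(out) {}

    void Schedule(const std::string& kernelName, const std::function<void()>& kernel)
    {
        const auto start = std::chrono::steady_clock::now();
        m_Real.Schedule(kernel); // forward to the real scheduler
        const auto stop  = std::chrono::steady_clock::now();
        m_Out.push_back({kernelName,
                         std::chrono::duration<double, std::milli>(stop - start).count()});
    }

private:
    FakeScheduler&            m_Real;
    std::vector<Measurement>& m_Out;
};

int main()
{
    FakeScheduler real;
    std::vector<Measurement> measurements;
    TimingInterceptor interceptor(real, measurements);

    interceptor.Schedule("NEGEMMKernel", []
    {
        long sum = 0;
        for (long i = 0; i < 1000000; ++i) { sum = sum + i; }
    });

    for (const auto& m : measurements)
    {
        std::cout << m.m_Name << ": " << m.m_Ms << " ms" << std::endl;
    }
}

In the real class the measurement list belongs to the owning NeonTimer and the timing is done with WallClockTimer, but the forward-then-record structure is the same.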
\ No newline at end of file diff --git a/src/armnn/NeonInterceptorScheduler.hpp b/src/armnn/NeonInterceptorScheduler.hpp new file mode 100644 index 0000000000..b8ecbd59c2 --- /dev/null +++ b/src/armnn/NeonInterceptorScheduler.hpp @@ -0,0 +1,37 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "NeonTimer.hpp" +#include "WallClockTimer.hpp" + +#include <arm_compute/runtime/IScheduler.h> +#include <arm_compute/runtime/Scheduler.h> +#include <arm_compute/core/CPP/ICPPKernel.h> + +namespace armnn +{ + +class NeonInterceptorScheduler : public arm_compute::IScheduler +{ +public: + NeonInterceptorScheduler(NeonTimer::KernelMeasurements &kernels, arm_compute::IScheduler &realScheduler); + ~NeonInterceptorScheduler() = default; + + void set_num_threads(unsigned int numThreads) override; + + unsigned int num_threads() const override; + + void schedule(arm_compute::ICPPKernel *kernel, const Hints &hints) override; + + void run_workloads(std::vector<Workload> &workloads) override; + +private: + NeonTimer::KernelMeasurements& m_Kernels; + arm_compute::IScheduler& m_RealScheduler; + WallClockTimer m_Timer; +}; + +} // namespace armnn diff --git a/src/armnn/NeonTimer.cpp b/src/armnn/NeonTimer.cpp new file mode 100644 index 0000000000..0c1e2e6a34 --- /dev/null +++ b/src/armnn/NeonTimer.cpp @@ -0,0 +1,56 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonTimer.hpp" +#include "NeonInterceptorScheduler.hpp" + +#include <memory> + +#include <boost/assert.hpp> +#include <boost/format.hpp> + +namespace armnn +{ + +void NeonTimer::Start() +{ + m_Kernels.clear(); + m_RealSchedulerType = arm_compute::Scheduler::get_type(); + //Note: We can't currently replace a custom scheduler + if(m_RealSchedulerType != arm_compute::Scheduler::Type::CUSTOM) + { + // Keep the real schedule and add NeonInterceptorScheduler as an interceptor + m_RealScheduler = &arm_compute::Scheduler::get(); + auto interceptor = std::make_shared<NeonInterceptorScheduler>(m_Kernels, *m_RealScheduler); + arm_compute::Scheduler::set(std::static_pointer_cast<arm_compute::IScheduler>(interceptor)); + } +} + +void NeonTimer::Stop() +{ + // Restore real scheduler + arm_compute::Scheduler::set(m_RealSchedulerType); + m_RealScheduler = nullptr; +} + +std::vector<Measurement> NeonTimer::GetMeasurements() const +{ + std::vector<Measurement> measurements = m_Kernels; + unsigned int kernel_number = 0; + for (auto & kernel : measurements) + { + std::string kernelName = std::string(this->GetName()) + "/" + std::to_string(kernel_number++) + ": " + kernel + .m_Name; + kernel.m_Name = kernelName; + } + return measurements; +} + +const char* NeonTimer::GetName() const +{ + return "NeonKernelTimer"; +} + +} diff --git a/src/armnn/NeonTimer.hpp b/src/armnn/NeonTimer.hpp new file mode 100644 index 0000000000..5685c4a6fe --- /dev/null +++ b/src/armnn/NeonTimer.hpp @@ -0,0 +1,43 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include "Instrument.hpp" + +#include <arm_compute/runtime/IScheduler.h> +#include <arm_compute/runtime/Scheduler.h> +#include <arm_compute/core/CPP/ICPPKernel.h> + +#include <chrono> +#include <map> +#include <list> + +namespace armnn +{ + +class NeonTimer : public Instrument +{ +public: + using KernelMeasurements = std::vector<Measurement>; + + NeonTimer() = default; + ~NeonTimer() = default; + + void Start() override; + + void Stop() override; + + std::vector<Measurement> GetMeasurements() const override; + + const char* GetName() const override; + +private: + KernelMeasurements m_Kernels; + arm_compute::IScheduler* m_RealScheduler; + arm_compute::Scheduler::Type m_RealSchedulerType; +}; + +}
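NeonTimer::Start() saves the currently installed scheduler type, installs the interceptor unless a custom scheduler is already in place, and Stop() puts the original back; GetMeasurements() then prefixes each recorded kernel with the timer name and an index. The save/install/restore idea is sketched below as a scope guard against a stand-in registry (CurrentScheduler is a hypothetical name, not the arm_compute::Scheduler API):

#include <iostream>

enum class SchedulerType { Default, Custom };

// Stand-in for a process-wide scheduler registry.
SchedulerType& CurrentScheduler()
{
    static SchedulerType type = SchedulerType::Default;
    return type;
}

class ScopedSchedulerSwap
{
public:
    ScopedSchedulerSwap()
        : m_Saved(CurrentScheduler())
    {
        // A custom scheduler installed by someone else is left untouched.
        if (m_Saved != SchedulerType::Custom)
        {
            CurrentScheduler() = SchedulerType::Custom;
        }
    }

    ~ScopedSchedulerSwap()
    {
        CurrentScheduler() = m_Saved; // restore the original scheduler
    }

private:
    SchedulerType m_Saved;
};

int main()
{
    {
        ScopedSchedulerSwap swap;
        std::cout << "interceptor installed: "
                  << (CurrentScheduler() == SchedulerType::Custom) << std::endl;
    }
    std::cout << "original restored: "
              << (CurrentScheduler() == SchedulerType::Default) << std::endl;
}

The production class keeps explicit Start()/Stop() calls rather than a scope guard because it is driven through the Instrument interface.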
\ No newline at end of file diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp index 0a5325c2a4..f510207c06 100644 --- a/src/armnn/Network.cpp +++ b/src/armnn/Network.cpp @@ -5,16 +5,21 @@ #include "Network.hpp" #include "Graph.hpp" #include "Layer.hpp" +#include "DeviceSpec.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/WorkloadFactory.hpp" #include "Optimizer.hpp" +#include "armnn/Exceptions.hpp" #include <armnn/Utils.hpp> +#include <armnn/TypesUtils.hpp> #include <fcntl.h> #include <algorithm> #include <fstream> #include <memory> +#include <vector> +#include <algorithm> #include <boost/assert.hpp> #include <boost/format.hpp> @@ -22,6 +27,8 @@ #include <boost/numeric/conversion/converter_policies.hpp> #include <boost/cast.hpp> +#include "optimizations/All.hpp" + namespace armnn { @@ -62,43 +69,195 @@ Status OptimizedNetwork::SerializeToDot(std::ostream& stream) const return m_Graph->SerializeToDot(stream); } -IOptimizedNetworkPtr Optimize(const INetwork& inNetwork, const DeviceSpec& deviceSpec) +IOptimizedNetworkPtr Optimize(const INetwork& inNetwork, + const std::vector<armnn::Compute>& backendPreferences, + const IDeviceSpec& deviceSpec, + const OptimizerOptions& options) { + if (backendPreferences.empty()) { + throw armnn::InvalidArgumentException("Invoked Optimize with no backends specified"); + } const Network& network = *boost::polymorphic_downcast<const Network*>(&inNetwork); std::unique_ptr<Graph> graph = std::make_unique<Graph>(network.GetGraph()); - OptimizedNetwork* optNet = new OptimizedNetwork(std::move(graph)); + auto optNet = IOptimizedNetworkPtr(new OptimizedNetwork(std::move(graph)), &IOptimizedNetwork::Destroy); - Optimizer::Optimize(optNet->GetGraph()); + OptimizedNetwork* optNetObjPtr = boost::polymorphic_downcast<OptimizedNetwork*>(optNet.get()); + + // Perform optimisation passes + using namespace optimizations; + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(SquashEqualPermuteSiblings(), + SquashEqualReshapeSiblings(), + OptimizeInversePermutes(), + MovePermuteUp(), + PermuteAsReshape(), + OptimizeConsecutiveReshapes())); // Infer the tensor infos for all output slots. Throws an exception on failure. - optNet->GetGraph().InferTensorInfos(); + optNetObjPtr->GetGraph().InferTensorInfos(); - // Assign a compute device for all nodes - for (auto&& layer : optNet->GetGraph()) + // if Fp32 to Fp16 optimization is set convert Fp32 network to Fp16 + if (options.m_ReduceFp32ToFp16) { - DataType dataType = layer->GetDataType(); + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(Fp32NetworkToFp16Converter())); + } + + // We know that DeviceSpec should be the only implementation of IDeviceSpec. + const DeviceSpec& spec = *boost::polymorphic_downcast<const DeviceSpec*>(&deviceSpec); + + // determine which of the preferred backends we have available for use + // and whether we have specified CpuRef as one of those backends. + bool cpuRefUsed = false; + std::vector<armnn::Compute> availablePreferredBackends; + for (const armnn::Compute& backend : backendPreferences) + { + // Check if the backend is in the available backend devices. 
+ if (std::find(spec.m_SupportedComputeDevices.begin(), + spec.m_SupportedComputeDevices.end(), backend) != + spec.m_SupportedComputeDevices.end()) + { + availablePreferredBackends.push_back(backend); + if (armnn::Compute::CpuRef == backend) { + cpuRefUsed = true; + } + } + } + if (availablePreferredBackends.empty()) { + BOOST_LOG_TRIVIAL(warning) << "None of the preferred backends " << backendPreferences + << " are supported. Current platform provides " << spec.m_SupportedComputeDevices; + return {nullptr, &IOptimizedNetwork::Destroy}; + } - // Default to the user-requested compute device from the Runtime - layer->SetComputeDevice(deviceSpec.DefaultComputeDevice); + auto ReturnWithError = [&](Layer* layer) + { + BOOST_LOG_TRIVIAL(warning) << "Layer of type " << GetLayerTypeAsCString(layer->GetType()) + << " is not supported on any preferred backend " << backendPreferences; + return IOptimizedNetworkPtr(nullptr, &IOptimizedNetwork::Destroy); + }; - // If the layer is unsupported by this device, fall back to reference + // Assign a compute device for all nodes + for (auto&& layer : optNetObjPtr->GetGraph()) + { + DataType dataType = layer->GetDataType(); std::string reasonIfUnsupported; - if (!IWorkloadFactory::IsLayerSupported(*layer, dataType, reasonIfUnsupported)) + bool found = false; + for (const armnn::Compute& backend : availablePreferredBackends) { - BOOST_LOG_TRIVIAL(warning) << "Layer of type " << GetLayerTypeAsCString(layer->GetType()) << - " is not supported on requested backend " << layer->GetComputeDevice() << " (reason: " << - reasonIfUnsupported << "), falling back to CpuRef backend."; - layer->SetComputeDevice(Compute::CpuRef); + // need to set the compute device on the layer + // before we can check if it is supported + layer->SetComputeDevice(backend); + if (!IWorkloadFactory::IsLayerSupported(*layer, dataType, reasonIfUnsupported)) + { + if (dataType == DataType::Float16) + { + if (IWorkloadFactory::IsLayerSupported(*layer, DataType::Float32, reasonIfUnsupported) + && layer->GetType() != LayerType::ConvertFp32ToFp16 + && layer->GetType() != LayerType::ConvertFp16ToFp32) + { + // Insert FP16 -> FP32 conversion layer before current layer + std::vector<ConvertFp16ToFp32Layer*> convertFp16ToFp32Layers = + InsertConvertFp16ToFp32LayersBefore(optNetObjPtr->GetGraph(), *layer); + + // Insert FP32 -> FP16 conversion layer after current layer + std::vector<ConvertFp32ToFp16Layer*> convertFp32ToFp16Layers = + InsertConvertFp32ToFp16LayersAfter(optNetObjPtr->GetGraph(), *layer); + + // Assign a supported backend to the newly introduced conversion layers + auto AssignFirstSupportedBackend = [&](Layer* layer, Compute preferredBackend) + { + bool supportedBackendFound = false; + std::string reasonIfUnsupported; + + // Try preferred backend first + layer->SetComputeDevice(preferredBackend); + if (IWorkloadFactory::IsLayerSupported(*layer, boost::none, reasonIfUnsupported)) + { + supportedBackendFound = true; + } + else + { + for (const Compute& backend : availablePreferredBackends) + { + // Skip preferred backend (we already determined that it is not supported) + if (backend == preferredBackend) + { + continue; + } + + layer->SetComputeDevice(backend); + if (IWorkloadFactory::IsLayerSupported(*layer, boost::none, reasonIfUnsupported)) + { + supportedBackendFound = true; + break; + } + } + } + + return supportedBackendFound; + }; + + for (ConvertFp16ToFp32Layer* convertLayer : convertFp16ToFp32Layers) + { + if (!AssignFirstSupportedBackend(convertLayer, backend)) + { + return 
ReturnWithError(convertLayer); + } + } + + for (ConvertFp32ToFp16Layer* convertLayer : convertFp32ToFp16Layers) + { + if (!AssignFirstSupportedBackend(convertLayer, backend)) + { + return ReturnWithError(convertLayer); + } + } + + found = true; + break; + } + } + BOOST_LOG_TRIVIAL(warning) << "Layer of type " << GetLayerTypeAsCString(layer->GetType()) + << " is not supported on requested backend " << layer->GetComputeDevice() + << " (reason: " << reasonIfUnsupported + << "), falling back to the next backend."; + } + else + { + found = true; + break; + } } - BOOST_ASSERT_MSG(IWorkloadFactory::IsLayerSupported(*layer, dataType, reasonIfUnsupported), - "Layer has no valid compute device"); + // If the layer is unsupported by any devices, log and return a null network. + if (!found) { + // NOTE: if the layer is not an operation queue type AND we have not got CpuRef as a + // fallback we should set the compute device on the layer to CpuRef (these are not + // available as accelerated operations, or are only available under certain + // conditions, currently they comprise MemCopy, Constant, Permute) + armnn::LayerType layerType = layer->GetType(); + if (!cpuRefUsed && (layerType == armnn::LayerType::MemCopy || + layerType == armnn::LayerType::Constant || + layerType == armnn::LayerType::Permute)) + { + layer->SetComputeDevice(armnn::Compute::CpuRef); + } + else + { + return ReturnWithError(layer); + } + } } - optNet->GetGraph().AddCopyLayers(); + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(OptimizeInverseConversionsFp16(), + OptimizeInverseConversionsFp32())); + + optNetObjPtr->GetGraph().AddCopyLayers(); + + // Convert constants + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(ConvertConstantsFloatToHalf())); + Optimizer::Pass(optNetObjPtr->GetGraph(), MakeOptimizations(ConvertConstantsHalfToFloat())); - return {optNet, &IOptimizedNetwork::Destroy}; + return optNet; } Network::Network() @@ -116,9 +275,9 @@ IConnectableLayer* Network::AddInputLayer(LayerBindingId id, const char* name) } IConnectableLayer* Network::AddFullyConnectedLayerImpl(const FullyConnectedDescriptor& fullyConnectedDescriptor, - const ConstTensor& weights, - const ConstTensor* biases, - const char* name) + const ConstTensor& weights, + const ConstTensor* biases, + const char* name) { if (fullyConnectedDescriptor.m_BiasEnabled && (biases == nullptr)) { @@ -138,24 +297,24 @@ IConnectableLayer* Network::AddFullyConnectedLayerImpl(const FullyConnectedDescr } IConnectableLayer* Network::AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor, - const ConstTensor& weights, - const char* name) + const ConstTensor& weights, + const char* name) { return AddFullyConnectedLayerImpl(fullyConnectedDescriptor, weights, nullptr, name); } IConnectableLayer* Network::AddFullyConnectedLayer(const FullyConnectedDescriptor& fullyConnectedDescriptor, - const ConstTensor& weights, - const ConstTensor& biases, - const char* name) + const ConstTensor& weights, + const ConstTensor& biases, + const char* name) { return AddFullyConnectedLayerImpl(fullyConnectedDescriptor, weights, &biases, name); } IConnectableLayer* Network::AddConvolution2dLayerImpl(const Convolution2dDescriptor& convolution2dDescriptor, - const ConstTensor& weights, - const ConstTensor* biases, - const char* name) + const ConstTensor& weights, + const ConstTensor* biases, + const char* name) { if (convolution2dDescriptor.m_BiasEnabled && (biases == nullptr)) { @@ -175,15 +334,15 @@ IConnectableLayer* 
Network::AddConvolution2dLayerImpl(const Convolution2dDescrip } IConnectableLayer* Network::AddConvolution2dLayer(const Convolution2dDescriptor& convolution2dDescriptor, - const ConstTensor& weights, - const char* name) + const ConstTensor& weights, + const char* name) { return AddConvolution2dLayerImpl(convolution2dDescriptor, weights, nullptr, name); } IConnectableLayer* Network::AddConvolution2dLayer(const Convolution2dDescriptor& convolution2dDescriptor, - const ConstTensor& weights, - const ConstTensor& biases, - const char* name) + const ConstTensor& weights, + const ConstTensor& biases, + const char* name) { return AddConvolution2dLayerImpl(convolution2dDescriptor, weights, &biases, name); } @@ -199,7 +358,8 @@ IConnectableLayer* Network::AddDepthwiseConvolution2dLayerImpl( throw InvalidArgumentException("AddDepthwiseConvolution2dLayer: biases cannot be NULL"); } - const auto layer = m_Graph->AddLayer<DepthwiseConvolution2dLayer>(convolution2dDescriptor, name); + const auto layer = m_Graph->AddLayer<DepthwiseConvolution2dLayer>(convolution2dDescriptor, + name); layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(weights); @@ -245,7 +405,8 @@ IConnectableLayer* Network::AddActivationLayer(const ActivationDescriptor& activ return m_Graph->AddLayer<ActivationLayer>(activationDescriptor, name); } -IConnectableLayer* Network::AddNormalizationLayer(const NormalizationDescriptor& normalizationDescriptor, +IConnectableLayer* Network::AddNormalizationLayer(const NormalizationDescriptor& +normalizationDescriptor, const char* name) { return m_Graph->AddLayer<NormalizationLayer>(normalizationDescriptor, name); @@ -301,7 +462,8 @@ IConnectableLayer* Network::AddBatchNormalizationLayer(const BatchNormalizationD return layer; } -IConnectableLayer* Network::AddResizeBilinearLayer(const ResizeBilinearDescriptor& resizeDescriptor, const char* name) +IConnectableLayer* Network::AddResizeBilinearLayer(const ResizeBilinearDescriptor& +resizeDescriptor, const char* name) { return m_Graph->AddLayer<ResizeBilinearLayer>(resizeDescriptor,name); } @@ -313,10 +475,15 @@ IConnectableLayer* Network::AddL2NormalizationLayer(const char* name) IConnectableLayer* Network::AddConstantLayer(const ConstTensor& input, const char* name) { - return m_Graph->AddLayer<ConstantLayer>(std::make_shared<ScopedCpuTensorHandle>(input), name); + auto layer = m_Graph->AddLayer<ConstantLayer>(name); + + layer->m_LayerOutput = std::make_unique<ScopedCpuTensorHandle>(input); + + return layer; } -IConnectableLayer* Network::AddReshapeLayer(const ReshapeDescriptor& reshapeDescriptor, const char* name) +IConnectableLayer* Network::AddReshapeLayer(const ReshapeDescriptor& reshapeDescriptor, + const char* name) { return m_Graph->AddLayer<ReshapeLayer>(reshapeDescriptor, name); } @@ -326,6 +493,97 @@ IConnectableLayer* Network::AddFloorLayer(const char* name) return m_Graph->AddLayer<FloorLayer>(name); } +IConnectableLayer* Network::AddLstmLayer(const LstmDescriptor& descriptor, + const LstmInputParams& params, + const char* name) +{ + const auto layer = m_Graph->AddLayer<LstmLayer>(descriptor, name); + + //Lstm Basic Parameters + layer->m_BasicParameters.m_InputToForgetWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_InputToForgetWeights)); + layer->m_BasicParameters.m_InputToCellWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_InputToCellWeights)); + layer->m_BasicParameters.m_InputToOutputWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_InputToOutputWeights)); + 
layer->m_BasicParameters.m_RecurrentToForgetWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_RecurrentToForgetWeights)); + layer->m_BasicParameters.m_RecurrentToCellWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_RecurrentToCellWeights)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_RecurrentToOutputWeights)); + layer->m_BasicParameters.m_ForgetGateBias = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_ForgetGateBias)); + layer->m_BasicParameters.m_CellBias = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_CellBias)); + layer->m_BasicParameters.m_OutputGateBias = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_OutputGateBias)); + + //Lstm Cifg parameters + if(!descriptor.m_CifgEnabled) + { + if(params.m_InputToInputWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Input To Input Weights cannot be NULL"); + } + if(params.m_RecurrentToInputWeights == nullptr) + { + throw InvalidArgumentException( + "AddLstmLayer: Recurrent To Input Weights cannot be NULL"); + } + if(params.m_InputGateBias == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Input Gate Bias cannot be NULL"); + } + layer->m_CifgParameters.m_InputToInputWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_InputToInputWeights)); + layer->m_CifgParameters.m_RecurrentToInputWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_RecurrentToInputWeights)); + // In the VTS tests, cell-to-input weights may be null, even if the other CIFG params are not. + if(params.m_CellToInputWeights != nullptr) + { + layer->m_CifgParameters.m_CellToInputWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_CellToInputWeights)); + } + layer->m_CifgParameters.m_InputGateBias = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_InputGateBias)); + } + + //Lstm projection parameters + if(descriptor.m_ProjectionEnabled) + { + if(params.m_ProjectionWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Projection Weights cannot be NULL"); + } + layer->m_ProjectionParameters.m_ProjectionWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_ProjectionWeights)); + if(params.m_ProjectionBias != nullptr) + { + layer->m_ProjectionParameters.m_ProjectionBias = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_ProjectionBias)); + } + } + + //Lstm Peephole params + if(descriptor.m_PeepholeEnabled) + { + if(params.m_CellToForgetWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Cell To Forget Weights cannot be NULL"); + } + if(params.m_CellToOutputWeights == nullptr) + { + throw InvalidArgumentException("AddLstmLayer: Cell To Output Weights cannot be NULL"); + } + layer->m_PeepholeParameters.m_CellToForgetWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_CellToForgetWeights)); + layer->m_PeepholeParameters.m_CellToOutputWeights = + std::make_unique<ScopedCpuTensorHandle>(*(params.m_CellToOutputWeights)); + } + return layer; +} + OptimizedNetwork::OptimizedNetwork(std::unique_ptr<Graph> graph) : m_Graph(std::move(graph)) { @@ -336,4 +594,3 @@ OptimizedNetwork::~OptimizedNetwork() } } // namespace armnn - diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp index 4eb67b1a15..72100aae6c 100644 --- a/src/armnn/Network.hpp +++ b/src/armnn/Network.hpp @@ -5,6 +5,7 @@ #pragma once #include <armnn/DescriptorsFwd.hpp> +#include <armnn/LstmParams.hpp> #include <armnn/TensorFwd.hpp> #include <armnn/Types.hpp> @@ -20,7 +21,7 @@ namespace 
armnn { class Graph; -/// Private implementation of INetwork +/// Private implementation of INetwork. class Network final : public INetwork { public: @@ -108,6 +109,10 @@ public: IConnectableLayer* AddOutputLayer(LayerBindingId id, const char* name = nullptr) override; + IConnectableLayer* AddLstmLayer(const LstmDescriptor& descriptor, + const LstmInputParams& params, + const char* name = nullptr) override; + private: IConnectableLayer* AddFullyConnectedLayerImpl(const FullyConnectedDescriptor& fullyConnectedDescriptor, const ConstTensor& weights, diff --git a/src/armnn/NetworkUtils.hpp b/src/armnn/NetworkUtils.hpp new file mode 100644 index 0000000000..0228813a25 --- /dev/null +++ b/src/armnn/NetworkUtils.hpp @@ -0,0 +1,79 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Graph.hpp" + +namespace armnn +{ + +inline std::vector<ConvertFp16ToFp32Layer*> InsertConvertFp16ToFp32LayersBefore(Graph& graph, Layer& layer) +{ + std::vector<ConvertFp16ToFp32Layer*> convertLayers; + convertLayers.reserve(layer.GetNumInputSlots()); + + for (auto&& inputSlot = layer.BeginInputSlots(); inputSlot != layer.EndInputSlots(); ++inputSlot) + { + // Insert FP16 to FP32 converter layer before the layer + const std::string name = + std::string("convert_fp16_to_fp32-" + std::to_string(inputSlot->GetSlotIndex()) + "-") + layer.GetName(); + ConvertFp16ToFp32Layer* convertLayer = + graph.InsertNewLayer<ConvertFp16ToFp32Layer>(*inputSlot, name.c_str()); + + // Sets output tensor info for the convert layer + TensorInfo convertInfo = convertLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + convertInfo.SetDataType(DataType::Float32); + + convertLayer->GetOutputSlot().SetTensorInfo(convertInfo); + + convertLayers.emplace_back(convertLayer); + } + + // Sets the output tensor info for the unsupported layer + auto UpdateTensorInfo = [](auto& outputSlot) + { + // Copy original tensor info and change data type to FP32 + TensorInfo newTensorInfo = outputSlot.GetTensorInfo(); + newTensorInfo.SetDataType(DataType::Float32); + + outputSlot.SetTensorInfo(newTensorInfo); + }; + + std::for_each(layer.BeginOutputSlots(), layer.EndOutputSlots(), UpdateTensorInfo); + + return convertLayers; +} + +inline std::vector<ConvertFp32ToFp16Layer*> InsertConvertFp32ToFp16LayersAfter(Graph& graph, Layer& layer) +{ + std::vector<ConvertFp32ToFp16Layer*> convertLayers; + convertLayers.reserve(layer.GetNumOutputSlots()); + + int index = 0; + // Change outputs to DataType::Float16 + for (auto&& outputSlot = layer.BeginOutputSlots(); outputSlot != layer.EndOutputSlots(); ++outputSlot) + { + BOOST_ASSERT(outputSlot->GetTensorInfo().GetDataType() == DataType::Float32); + + // Insert FP32 to FP16 converter layer after the layer + const std::string name = + std::string("convert_fp32_to_fp16-" + std::to_string(index++) + "-") + layer.GetName(); + ConvertFp32ToFp16Layer* convertLayer = + graph.InsertNewLayer<ConvertFp32ToFp16Layer>(*outputSlot, name.c_str()); + + // Sets output tensor info for the convert layer. + TensorInfo convertInfo = convertLayer->GetInputSlot(0).GetConnectedOutputSlot()->GetTensorInfo(); + convertInfo.SetDataType(DataType::Float16); + + convertLayer->GetOutputSlot().SetTensorInfo(convertInfo); + + convertLayers.emplace_back(convertLayer); + } + + return convertLayers; +} + +} //namespace armnn
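These helpers are what the backend-assignment loop in Optimize() relies on when a layer fails only because its data type is Float16: conversion layers are inserted before and after it, and the layer itself is re-checked in Float32 on the same backend. A much-simplified sketch of that decision, with FakeLayer, Backend and IsSupported as stand-ins for the real Arm NN types:

#include <iostream>
#include <string>
#include <vector>

enum class Backend { GpuAcc, CpuAcc, CpuRef };
enum class DataType { Float16, Float32 };

struct FakeLayer
{
    std::string name;
    DataType    dataType;
};

// Stand-in for IWorkloadFactory::IsLayerSupported: pretend GpuAcc cannot run FP16.
bool IsSupported(const FakeLayer&, Backend backend, DataType type)
{
    return !(backend == Backend::GpuAcc && type == DataType::Float16);
}

bool AssignBackend(const FakeLayer& layer, const std::vector<Backend>& preferred)
{
    for (Backend backend : preferred)
    {
        if (IsSupported(layer, backend, layer.dataType))
        {
            std::cout << layer.name << ": assigned directly" << std::endl;
            return true;
        }
        if (layer.dataType == DataType::Float16 &&
            IsSupported(layer, backend, DataType::Float32))
        {
            // Insert ConvertFp16ToFp32 layers before and ConvertFp32ToFp16 layers after,
            // then run the layer in FP32 (cf. the helpers above).
            std::cout << layer.name << ": wrapped with conversion layers" << std::endl;
            return true;
        }
    }
    return false; // no preferred backend can run this layer
}

int main()
{
    FakeLayer layer{"conv1", DataType::Float16};
    AssignBackend(layer, {Backend::GpuAcc, Backend::CpuRef});
}

If no preferred backend can take the layer at all, only MemCopy, Constant and Permute layers are quietly reassigned to CpuRef (and only when CpuRef was not already among the preferences); any other unplaceable layer makes Optimize() log a warning and return a null network.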
\ No newline at end of file diff --git a/src/armnn/Observable.cpp b/src/armnn/Observable.cpp new file mode 100644 index 0000000000..7179a10ccd --- /dev/null +++ b/src/armnn/Observable.cpp @@ -0,0 +1,36 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "Observable.hpp" + +namespace armnn +{ + +void AddedLayerObservable::Update(Layer* graphLayer) +{ + m_ObservedObjects.emplace_back(graphLayer); +} + +void ErasedLayerNamesObservable::Update(Layer* graphLayer) +{ + auto& relatedLayerNames = graphLayer->GetRelatedLayerNames(); + + // If the erased layer has no related layers we take the erased layer's name + // Otherwise we need to preserve the related layer names, + // since we want to preserve the original graph's information + if (relatedLayerNames.empty()) + { + m_ObservedObjects.emplace_back(graphLayer->GetName()); + } + else + { + for (auto& relatedLayerName : relatedLayerNames) + { + m_ObservedObjects.emplace_back(relatedLayerName); + } + } +} + +} diff --git a/src/armnn/Observable.hpp b/src/armnn/Observable.hpp new file mode 100644 index 0000000000..8f33c0b3e3 --- /dev/null +++ b/src/armnn/Observable.hpp @@ -0,0 +1,67 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IGraphObservable.hpp" +#include "Graph.hpp" + +namespace armnn +{ + +template <typename ObservedType> +class GraphObservable : public IGraphObservable +{ +public: + using Iterator = typename std::list<ObservedType>::const_iterator; + + GraphObservable(Graph& subject, GraphEvent notifyOnEvent) + : m_Subject(&subject) + { + m_NotifyOnEvent = notifyOnEvent; + m_Subject->AttachObservable(this, m_NotifyOnEvent); + }; + + void Clear() { m_ObservedObjects.clear(); }; + + Iterator begin() { return m_ObservedObjects.begin(); } + + Iterator end() { return m_ObservedObjects.end(); } + +protected: + ~GraphObservable() + { + if (m_Subject) + { + m_Subject->DetachObservable(this, m_NotifyOnEvent); + } + } + + GraphEvent m_NotifyOnEvent; + Graph* m_Subject; + std::list<ObservedType> m_ObservedObjects; +}; + +class AddedLayerObservable : public GraphObservable<Layer*> +{ +public: + explicit AddedLayerObservable(Graph& subject) + : GraphObservable<Layer*>(subject, GraphEvent::LayerAdded) + {}; + + void Update(Layer* graphLayer) override; +}; + +class ErasedLayerNamesObservable : public GraphObservable<std::string> +{ +public: + explicit ErasedLayerNamesObservable(Graph& subject) + : GraphObservable<std::string>(subject, GraphEvent::LayerErased) + {}; + + void Update(Layer* graphLayer) override; +}; + +} //namespace armnn + diff --git a/src/armnn/OpenClTimer.cpp b/src/armnn/OpenClTimer.cpp new file mode 100644 index 0000000000..8559fefafd --- /dev/null +++ b/src/armnn/OpenClTimer.cpp @@ -0,0 +1,105 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "OpenClTimer.hpp" + +#include <string> +#include <sstream> + +namespace armnn +{ + +OpenClTimer::OpenClTimer() +{ +} + +void OpenClTimer::Start() +{ + m_Kernels.clear(); + + auto interceptor = [this]( cl_command_queue command_queue, + cl_kernel kernel, + cl_uint work_dim, + const size_t *gwo, + const size_t *gws, + const size_t *lws, + cl_uint num_events_in_wait_list, + const cl_event * event_wait_list, + cl_event * event) + { + cl_int retVal = 0; + + // Get the name of the kernel + cl::Kernel retainedKernel(kernel, true); + std::stringstream ss; + ss << retainedKernel.getInfo<CL_KERNEL_FUNCTION_NAME>(); + + // Embed workgroup sizes into the name + if(gws != nullptr) + { + ss << " GWS[" << gws[0] << "," << gws[1] << "," << gws[2] << "]"; + } + if(lws != nullptr) + { + ss << " LWS[" << lws[0] << "," << lws[1] << "," << lws[2] << "]"; + } + + cl_event customEvent; + + // Forward to original OpenCl function + retVal = m_OriginalEnqueueFunction( command_queue, + kernel, + work_dim, + gwo, + gws, + lws, + num_events_in_wait_list, + event_wait_list, + &customEvent); + + // Store the Kernel info for later GetMeasurements() call + m_Kernels.emplace_back(ss.str(), customEvent); + + return retVal; + }; + + m_OriginalEnqueueFunction = CLSymbols::get().clEnqueueNDRangeKernel_ptr; + CLSymbols::get().clEnqueueNDRangeKernel_ptr = interceptor; +} + +void OpenClTimer::Stop() +{ + CLSymbols::get().clEnqueueNDRangeKernel_ptr = m_OriginalEnqueueFunction; +} + +std::vector<Measurement> OpenClTimer::GetMeasurements() const +{ + std::vector<Measurement> measurements; + + cl_command_queue_properties clQueueProperties = CLScheduler::get().queue().getInfo<CL_QUEUE_PROPERTIES>(); + + int idx = 0; + for (auto& kernel : m_Kernels) + { + std::string name = std::string(this->GetName()) + "/" + std::to_string(idx++) + ": " + kernel.m_Name; + + double timeUs = 0.0; + if((clQueueProperties & CL_QUEUE_PROFILING_ENABLE) != 0) + { + // Wait for the event to finish before accessing profile results. + kernel.m_Event.wait(); + + cl_ulong start = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_START>(); + cl_ulong end = kernel.m_Event.getProfilingInfo<CL_PROFILING_COMMAND_END>(); + timeUs = static_cast<double>(end - start) / 1000.0; + } + + measurements.emplace_back(name, timeUs, Measurement::Unit::TIME_US); + } + + return measurements; +} + +} //namespace armnn diff --git a/src/armnn/OpenClTimer.hpp b/src/armnn/OpenClTimer.hpp new file mode 100644 index 0000000000..09d7a8b949 --- /dev/null +++ b/src/armnn/OpenClTimer.hpp @@ -0,0 +1,59 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Instrument.hpp" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/core/CL/OpenCL.h" + +#include <vector> +#include <list> + +namespace armnn +{ + +/// OpenClTimer instrument that times all OpenCl kernels executed between calls to Start() and Stop(). +class OpenClTimer : public Instrument +{ +public: + OpenClTimer(); + ~OpenClTimer() = default; + + /// Start the OpenCl timer + void Start() override; + + /// Stop the OpenCl timer + void Stop() override; + + /// Get the name of the timer + /// \return Name of the timer + const char* GetName() const override { return "OpenClKernelTimer"; } + + /// Get the recorded measurements. This will be a list of the execution durations for all the OpenCl kernels. 
+ /// \return Recorded measurements + std::vector<Measurement> GetMeasurements() const override; + +private: + using CLScheduler = arm_compute::CLScheduler; + using CLSymbols = arm_compute::CLSymbols; + using ClEvent = cl::Event; + using ClEnqueueFunc = decltype(CLSymbols::clEnqueueNDRangeKernel_ptr); + + /// Stores info about the OpenCl kernel + struct KernelInfo + { + KernelInfo(const std::string& name, cl_event& event) : m_Name(name), m_Event(event) {} + + std::string m_Name; + ClEvent m_Event; + }; + + std::list<KernelInfo> m_Kernels; ///< List of all kernels executed + ClEnqueueFunc m_OriginalEnqueueFunction; ///< Keep track of original OpenCl function +}; + +} //namespace armnn
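OpenClTimer records kernels by swapping the clEnqueueNDRangeKernel pointer held by CLSymbols for a lambda that forwards to the original entry point with its own cl_event, whose CL_PROFILING_COMMAND_START/END timestamps are read later in GetMeasurements(). The hook-and-forward mechanism in isolation, using a stand-in std::function entry point (gEnqueueKernel and EnqueueHook are hypothetical names, not OpenCL or ACL symbols):

#include <functional>
#include <iostream>
#include <string>
#include <vector>

using EnqueueFunc = std::function<int(const std::string& kernelName)>;

// Stand-in for the library-owned dispatch table entry.
EnqueueFunc gEnqueueKernel = [](const std::string& kernelName)
{
    std::cout << "executing " << kernelName << std::endl;
    return 0;
};

class EnqueueHook
{
public:
    void Start()
    {
        m_Original = gEnqueueKernel;
        // Intercept: record the kernel name, then forward to the original function.
        gEnqueueKernel = [this](const std::string& kernelName)
        {
            m_Recorded.push_back(kernelName);
            return m_Original(kernelName);
        };
    }

    void Stop() { gEnqueueKernel = m_Original; }

    const std::vector<std::string>& Recorded() const { return m_Recorded; }

private:
    EnqueueFunc              m_Original;
    std::vector<std::string> m_Recorded;
};

int main()
{
    EnqueueHook hook;
    hook.Start();
    gEnqueueKernel("im2col");
    gEnqueueKernel("gemm");
    hook.Stop();

    for (const auto& name : hook.Recorded())
    {
        std::cout << "recorded: " << name << std::endl;
    }
}

The recorded timestamps are only meaningful when the command queue was created with CL_QUEUE_PROFILING_ENABLE, which is why GetMeasurements() checks the queue properties before waiting on each event.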
\ No newline at end of file diff --git a/src/armnn/Optimizer.cpp b/src/armnn/Optimizer.cpp index 9b76c7fa72..630aa1a27b 100644 --- a/src/armnn/Optimizer.cpp +++ b/src/armnn/Optimizer.cpp @@ -3,6 +3,7 @@ // See LICENSE file in the project root for full license information. // #include "Optimizer.hpp" +#include "Observable.hpp" #include "optimizations/All.hpp" namespace armnn @@ -10,44 +11,50 @@ namespace armnn Optimizer::Optimizer() { - // Add optimizations here - static optimizations::SquashEqualPermuteSiblings squashEqualPermuteSiblings; - static optimizations::SquashEqualReshapeSiblings squashEqualReshapeSiblings; - static optimizations::OptimizeInversePermutes optimizeInversePermutes; - static optimizations::MovePermuteUp movePermuteUp; - static optimizations::PermuteAsReshape permuteAsReshape; - static optimizations::OptimizeConsecutiveReshapes optimizeConsecutiveReshapes; - - // Set optimizations in desired order - m_Optimizations = {&squashEqualPermuteSiblings, - &squashEqualReshapeSiblings, - &optimizeInversePermutes, - &movePermuteUp, - &permuteAsReshape, - &optimizeConsecutiveReshapes, - }; } -void Optimizer::Optimize(Graph& graph) +void Optimizer::Pass(Graph& graph, const Optimizations& optimizations) { - Optimizer optimizer; + // Create observables to observe changes to the graph + AddedLayerObservable addedLayerObservable(graph); + ErasedLayerNamesObservable erasedLayerNamesObservable(graph); + + bool graphNeedsSorting = false; auto it = graph.TopologicalSort().end(); - // Call TopologicalSort() in every iteration to re-order the list in case layers where added/removed. + + // Calls TopologicalSort() for every iteration to re-order the list in case layers were added/removed. while (it != graph.TopologicalSort().begin()) { --it; - for (auto&& optimization : optimizer.m_Optimizations) + for (auto&& optimization : optimizations) { optimization->Run(graph, **it); if ((*it)->IsOutputUnconnected()) { it = graph.EraseLayer(it); + graphNeedsSorting = true; + } + + // Add the names of erased layers as related layers to the new added layers + for (auto& erasedLayerName : erasedLayerNamesObservable) + { + for (auto& addedLayer : addedLayerObservable) + { + addedLayer->AddRelatedLayerName(erasedLayerName); + } + } + + erasedLayerNamesObservable.Clear(); + addedLayerObservable.Clear(); + + if (graphNeedsSorting) + { + graphNeedsSorting = false; break; } } } } - } // namespace armnn diff --git a/src/armnn/Optimizer.hpp b/src/armnn/Optimizer.hpp index 1f5ed026fb..06720b040a 100644 --- a/src/armnn/Optimizer.hpp +++ b/src/armnn/Optimizer.hpp @@ -5,25 +5,48 @@ #pragma once #include <vector> +#include <memory> +#include "optimizations/All.hpp" namespace armnn { -class Graph; -class Optimization; - class Optimizer { public: + using OptimizationPtr = std::unique_ptr<Optimization>; + using Optimizations = std::vector<OptimizationPtr>; - static void Optimize(Graph& graph); + static void Pass(Graph& graph, const Optimizations& optimizations); private: ~Optimizer() = default; Optimizer(); +}; + - std::vector<Optimization*> m_Optimizations; +template<typename T> +void Append(Optimizer::Optimizations& optimizations, T&& optimization) +{ + optimizations.emplace_back(new T(optimization)); }; +template<typename Front, typename... Others> +void Append(Optimizer::Optimizations& optimizations, Front&& front, Others&&... others) +{ + Append<Front>(optimizations, std::forward<Front>(front)); + Append<Others...>(optimizations, std::forward<Others>(others)...); +}; + +template<typename... 
Args> +Optimizer::Optimizations MakeOptimizations(Args&&... args) +{ + Optimizer::Optimizations optimizations; + + Append(optimizations, std::forward<Args>(args)...); + + return optimizations; +} + } // namespace armnn diff --git a/src/armnn/Profiling.cpp b/src/armnn/Profiling.cpp index 15a195e6bd..f70f6a34d1 100644 --- a/src/armnn/Profiling.cpp +++ b/src/armnn/Profiling.cpp @@ -3,8 +3,7 @@ // See LICENSE file in the project root for full license information. // #include "Profiling.hpp" - -#if ARMNN_PROFILING_ENABLED +#include "JsonPrinter.hpp" #if ARMNN_STREAMLINE_ENABLED #include <streamline_annotate.h> @@ -17,10 +16,12 @@ #include <algorithm> #include <iomanip> #include <iostream> +#include <fstream> #include <map> #include <stack> -#include <boost/algorithm/string.hpp> +#include <boost/algorithm/string.hpp> +#include <boost/core/ignore_unused.hpp> namespace armnn { @@ -32,86 +33,128 @@ constexpr std::size_t g_ProfilingEventCountHint = 1024; // Whether profiling reports should include the sequence of events together with their timings. constexpr bool g_WriteProfilingEventSequence = true; -// Whether profiling reports should also report detailed information on events grouped by tag. -// This is used to group stats per inference (see usage of ARMNN_UPDATE_PROFILING_EVENT_TAG in -// Runtime::EnqueueWorkload). This can spam the output stream, so use carefully (or adapt -// the code to just output information for a tag of interest). -constexpr bool g_AggregateProfilingEventsByTag = false; +// Whether profiling reports should also report detailed information on events grouped by inference. +// This can spam the output stream, so use carefully (or adapt the code to just output information +// of interest). +constexpr bool g_AggregateProfilingEventsByInference = true; -// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler -// singleton is destroyed. It can be convenient for local tests. -constexpr bool g_WriteReportToStdOutOnProfilerDestruction = true; +// Whether a call to Profiler::AnalyzeEventsAndWriteResults() will be made when the Profiler is destroyed. +// It can be convenient for local tests. +constexpr bool g_WriteReportToStdOutOnProfilerDestruction = false; // Whether events denoting operations running on the GPU should force a sync before/after the event. // This is hardcoded to true for now as the profiling timings are not very useful without it. +#if ARMCOMPUTECL_ENABLED constexpr bool g_ProfilingForceGpuSync = true; +#endif + +Measurement FindMeasurement(const std::string& name, const Event* event) +{ + + BOOST_ASSERT(event != nullptr); + + // Search though the measurements. + for (const auto& measurement : event->GetMeasurements()) + { + if (measurement.m_Name == name) + { + // Measurement found. + return measurement; + } + } + + // Measurement not found. + return Measurement{ "", 0.f, Measurement::Unit::TIME_MS }; +} + +std::vector<Measurement> FindKernelMeasurements(const Event* event) +{ + BOOST_ASSERT(event != nullptr); + + std::vector<Measurement> measurements; + + // Search through the measurements. + for (const auto& measurement : event->GetMeasurements()) + { + if (measurement.m_Name.rfind("OpenClKernelTimer", 0) == 0 + || measurement.m_Name.rfind("NeonKernelTimer", 0) == 0) + { + // Measurement found. 
+ measurements.push_back(measurement); + } + } + + return measurements; +} std::map<std::string, Profiler::ProfilingEventStats> Profiler::CalculateProfilingEventStats() const { std::map<std::string, ProfilingEventStats> nameToStatsMap; - for (auto&& event : m_EventSequence) + for (const auto& event : m_EventSequence) { - auto mapIter = nameToStatsMap.find(event.m_Label); - if (mapIter != nameToStatsMap.end()) + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, event.get()); + + double durationMs = measurement.m_Value; + auto it = nameToStatsMap.find(event->GetName()); + if (it != nameToStatsMap.end()) { - ProfilingEventStats& stats = mapIter->second; - stats.m_TotalMs += event.DurationMs(); - stats.m_MinMs = std::min(stats.m_MinMs, event.DurationMs()); - stats.m_MaxMs = std::max(stats.m_MaxMs, event.DurationMs()); + ProfilingEventStats& stats = it->second; + stats.m_TotalMs += durationMs; + stats.m_MinMs = std::min(stats.m_MinMs, durationMs); + stats.m_MaxMs = std::max(stats.m_MaxMs, durationMs); ++stats.m_Count; } else { - ProfilingEventStats stats; - stats.m_TotalMs = event.DurationMs(); - stats.m_MinMs = event.DurationMs(); - stats.m_MaxMs = event.DurationMs(); - stats.m_Count = 1; - - nameToStatsMap[event.m_Label] = stats; + nameToStatsMap.emplace(event->GetName(), ProfilingEventStats{ durationMs, durationMs, durationMs, 1 }); } } return nameToStatsMap; } -void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>::const_iterator first, - std::vector<ProfilingEvent>::const_iterator last, - std::ostream& outStream) const +const Event* GetEventPtr(const Event* ptr) { return ptr;} +const Event* GetEventPtr(const std::unique_ptr<Event>& ptr) {return ptr.get(); } + +template<typename ItertType> +void Profiler::AnalyzeEventSequenceAndWriteResults(ItertType first, ItertType last, std::ostream& outStream) const { - // Output event sequence, if needed + // Outputs event sequence, if needed. if (g_WriteProfilingEventSequence) { - // Make sure timestamps are output with 6 decimals, and save old settings + // Makes sure timestamps are output with 6 decimals, and save old settings. std::streamsize oldPrecision = outStream.precision(); outStream.precision(6); std::ios_base::fmtflags oldFlags = outStream.flags(); outStream.setf(std::ios::fixed); - // Output fields + // Outputs fields. outStream << "Event Sequence - Name | Duration (ms) | Start (ms) | Stop (ms) | Device" << std::endl; for (auto event = first; event != last; ++event) { - std::chrono::duration<double, std::milli> startTimeMs = event->m_StartTime.time_since_epoch(); - std::chrono::duration<double, std::milli> stopTimeMs = event->m_StopTime.time_since_epoch(); - - outStream << std::setw(50) << event->m_Label << " " - << std::setw(20) << event->DurationMs() - << std::setw(20) << startTimeMs.count() - << std::setw(20) << stopTimeMs.count() - << std::setw(20) << Profiler::Get().GetEventComputeDevice(event->m_Device) - << std::endl; + const Event* eventPtr = GetEventPtr((*event)); + double startTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_START, eventPtr).m_Value; + double stopTimeMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME_STOP, eventPtr).m_Value; + + // Find the WallClock measurement if there is one. 
+ double durationMs = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, eventPtr).m_Value; + outStream << std::setw(50) << eventPtr->GetName() << " " + << std::setw(20) << durationMs + << std::setw(20) << startTimeMs + << std::setw(20) << stopTimeMs + << std::setw(20) << GetComputeDeviceAsCString(eventPtr->GetComputeDevice()) + << std::endl; } outStream << std::endl; - // Restore previous precision settings + // Restores previous precision settings. outStream.flags(oldFlags); outStream.precision(oldPrecision); } - // Aggregate results per event name + // Aggregates results per event name. std::map<std::string, ProfilingEventStats> nameToStatsMap = CalculateProfilingEventStats(); - // Output aggregated stats + // Outputs aggregated stats. outStream << "Event Stats - Name | Avg (ms) | Min (ms) | Max (ms) | Total (ms) | Count" << std::endl; for (const auto& pair : nameToStatsMap) { @@ -126,74 +169,236 @@ void Profiler::AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>:: outStream << std::endl; } -Profiler Profiler::s_Instance; - Profiler::Profiler() - : m_EventTag(0) - , m_NestingLevel(0) - , m_EventTagUpdated(false) + : m_ProfilingEnabled(false) { m_EventSequence.reserve(g_ProfilingEventCountHint); #if ARMNN_STREAMLINE_ENABLED - // Initialise streamline annotations + // Initialises streamline annotations. ANNOTATE_SETUP; #endif } Profiler::~Profiler() { - if (g_WriteReportToStdOutOnProfilerDestruction) + if (m_ProfilingEnabled) { - AnalyzeEventsAndWriteResults(std::cout); + if (g_WriteReportToStdOutOnProfilerDestruction) + { + Print(std::cout); + } } + + // Un-register this profiler from the current thread. + ProfilerManager::GetInstance().RegisterProfiler(nullptr); } -void Profiler::BeginEvent(Compute compute, const std::string label) +bool Profiler::IsProfilingEnabled() +{ + return m_ProfilingEnabled; +} + +void Profiler::EnableProfiling(bool enableProfiling) +{ + m_ProfilingEnabled = enableProfiling; +} + +Event* Profiler::BeginEvent(Compute compute, const std::string& label, std::vector<InstrumentPtr>&& instruments) { // We need to sync just before the begin event to not include time before the period we want to time. WaitForDevice(compute); - const TimePoint timeStamp = Clock::now(); - m_ObservedMarkers.emplace(Marker{m_EventSequence.size(), label, timeStamp, compute, m_EventTag}); - m_EventSequence.emplace_back(); + Event* parent = m_Parents.empty() ? nullptr : m_Parents.top(); + m_EventSequence.push_back(std::make_unique<Event>(label, this, parent, compute, std::move(instruments))); + Event* event = m_EventSequence.back().get(); + event->Start(); #if ARMNN_STREAMLINE_ENABLED - ANNOTATE_CHANNEL_COLOR(m_NestingLevel, GetEventColor(compute), label.c_str()); + ANNOTATE_CHANNEL_COLOR(m_Parents.size(), GetEventColor(compute), label.c_str()); #endif - m_NestingLevel++; + m_Parents.push(event); + return event; } -void Profiler::EndEvent(Compute compute) +void Profiler::EndEvent(Event* event) { - // We need to sync just before the end event to include all the time of the timed period. - WaitForDevice(compute); - - const Marker& marker = m_ObservedMarkers.top(); + event->Stop(); - const TimePoint startTime = marker.m_TimeStamp; - const TimePoint stopTime = Clock::now(); + BOOST_ASSERT(!m_Parents.empty()); + BOOST_ASSERT(event == m_Parents.top()); + m_Parents.pop(); - m_EventSequence[marker.m_Id] = {std::move(marker.m_EventName), - startTime, - stopTime, - marker.m_ComputeDevice, - marker.m_Tag}; - - m_ObservedMarkers.pop(); + Event* parent = m_Parents.empty() ? 
nullptr : m_Parents.top(); + boost::ignore_unused(parent); + BOOST_ASSERT(event->GetParentEvent() == parent); #if ARMNN_STREAMLINE_ENABLED - ANNOTATE_CHANNEL_END(m_NestingLevel); + ANNOTATE_CHANNEL_END(m_Parents.size()); #endif +} + +int CalcLevel(const Event* eventPtr) +{ + int level=0; + while (eventPtr != nullptr) + { + eventPtr = eventPtr->GetParentEvent(); + level++; + } + return level; +} + +void Profiler::PopulateInferences(std::vector<const Event*>& outInferences, int& outBaseLevel) const +{ + outInferences.reserve(m_EventSequence.size()); + for (const auto& event : m_EventSequence) + { + const Event* eventPtrRaw = event.get(); + if (eventPtrRaw->GetName() == "EnqueueWorkload") + { + outBaseLevel = (outBaseLevel == -1) ? CalcLevel(eventPtrRaw) : outBaseLevel; + outInferences.push_back(eventPtrRaw); + } + } +} + +void Profiler::PopulateDescendants(std::map<const Event*, std::vector<const Event*>>& outDescendantsMap) const +{ + for (const auto& event : m_EventSequence) + { + const Event* eventPtrRaw = event.get(); + const Event* parent = eventPtrRaw->GetParentEvent(); + + if (!parent) + { + continue; + } + + auto it = outDescendantsMap.find(parent); + if (it == outDescendantsMap.end()) + { + outDescendantsMap.emplace(parent, std::vector<const Event*>({eventPtrRaw})); + } + else + { + it->second.push_back(eventPtrRaw); + } + } +} + +void Profiler::Print(std::ostream& outStream) const +{ + // Makes sure timestamps are output with 6 decimals, and save old settings. + std::streamsize oldPrecision = outStream.precision(); + outStream.precision(6); + std::ios_base::fmtflags oldFlags = outStream.flags(); + outStream.setf(std::ios::fixed); + JsonPrinter printer(outStream); + + // First find all the "inference" Events and print out duration measurements. 
+ int baseLevel = -1; + std::vector<const Event*> inferences; + PopulateInferences(inferences, baseLevel); + + // Second map out descendants hierarchy + std::map<const Event*, std::vector<const Event*>> descendantsMap; + PopulateDescendants(descendantsMap); + + JsonChildObject inferenceObject{"inference_measurements"}; + JsonChildObject layerObject{"layer_measurements"}; + std::vector<JsonChildObject> workloadObjects; + std::map<unsigned int, std::vector<JsonChildObject>> workloadToKernelObjects; + + for (unsigned int inferenceIndex = 0; inferenceIndex < inferences.size(); ++inferenceIndex) + { + auto inference = inferences[inferenceIndex]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, inference); + inferenceObject.SetUnit(measurement.m_Unit); + inferenceObject.AddMeasurement(measurement.m_Value); + + auto layerEventsIt = descendantsMap.find(inference); + + // Assuming 1 Execute per inference + if (layerEventsIt != descendantsMap.end()) + { + auto layerEvent = layerEventsIt->second[0]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, layerEvent); + layerObject.SetUnit(measurement.m_Unit); + layerObject.AddMeasurement(measurement.m_Value); + + // Get Descendant Events for Execute + auto workloadEventsIt = descendantsMap.find(layerEvent); + for(unsigned int workloadIndex = 0; workloadIndex < workloadEventsIt->second.size(); ++workloadIndex) + { + auto workloadEvent = workloadEventsIt->second[workloadIndex]; + Measurement measurement = FindMeasurement(WallClockTimer::WALL_CLOCK_TIME, workloadEvent); + std::vector<Measurement> kernelMeasurements = FindKernelMeasurements(workloadEvent); + if (inferenceIndex == 0) + { + // Only add second level once, in case of multiple inferences + JsonChildObject workloadObject{workloadEvent->GetName()}; + workloadObject.SetUnit(measurement.m_Unit); + workloadObjects.push_back(workloadObject); + } + workloadObjects[workloadIndex].AddMeasurement(measurement.m_Value); + + for(unsigned int kernelIndex = 0; kernelIndex < kernelMeasurements.size(); ++kernelIndex) + { + if (inferenceIndex == 0) + { + // Only add kernel measurement once, in case of multiple inferences + JsonChildObject kernelObject{kernelMeasurements[kernelIndex].m_Name}; + kernelObject.SetUnit(kernelMeasurements[kernelIndex].m_Unit); + workloadToKernelObjects[workloadIndex].push_back(kernelObject); + + } + workloadToKernelObjects[workloadIndex][kernelIndex]. + AddMeasurement(kernelMeasurements[kernelIndex].m_Value); + } + } + } + } + + for (auto workloadToKernelPair : workloadToKernelObjects) + { + for (auto kernelObject : workloadToKernelPair.second) + { + workloadObjects[workloadToKernelPair.first].AddChild(kernelObject); + } + } - m_NestingLevel--; + for (auto workloadObject : workloadObjects) + { + layerObject.AddChild(workloadObject); + } + inferenceObject.AddChild(layerObject); + + printer.PrintHeader(); + printer.PrintArmNNHeader(); + + // print inference object, also prints child layer and kernel measurements + printer.PrintJsonChildObject(inferenceObject); + + // end of ArmNN + printer.PrintNewLine(); + printer.PrintFooter(); + + // end of main JSON object + printer.PrintNewLine(); + printer.PrintFooter(); + printer.PrintNewLine(); + + // Restores previous precision settings. + outStream.flags(oldFlags); + outStream.precision(oldPrecision); } void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const { // Stack should be empty now. 
- const bool saneMarkerSequence = m_ObservedMarkers.empty(); + const bool saneMarkerSequence = m_Parents.empty(); // Abort if the sequence of markers was found to have incorrect information: // The stats cannot be trusted. @@ -206,39 +411,69 @@ void Profiler::AnalyzeEventsAndWriteResults(std::ostream& outStream) const return; } - // Analyze the full sequence of events - AnalyzeEventSequenceAndWriteResults(m_EventSequence.begin(), m_EventSequence.end(), outStream); + // Analyzes the full sequence of events. + AnalyzeEventSequenceAndWriteResults(m_EventSequence.cbegin(), + m_EventSequence.cend(), + outStream); - // Aggregate events by tag if requested (spams the output stream if done for all tags) - if (m_EventTagUpdated && g_AggregateProfilingEventsByTag) + // Aggregates events by tag if requested (spams the output stream if done for all tags). + if (g_AggregateProfilingEventsByInference) { outStream << std::endl; outStream << "***" << std::endl; - outStream << "*** Per Tag Stats" << std::endl; + outStream << "*** Per Inference Stats" << std::endl; outStream << "***" << std::endl; outStream << std::endl; - for (auto iter = m_EventSequence.begin(); iter != m_EventSequence.end();) - { - const uint32_t tag = iter->m_Tag; + int baseLevel = -1; + std::vector<const Event*> inferences; + PopulateInferences(inferences, baseLevel); - // Advance iter until we find the first non-matching tag - auto tagEndIter = iter; - for (; tagEndIter != m_EventSequence.end(); ++tagEndIter) + // Second map out descendants hierarchy + std::map<const Event*, std::vector<const Event*>> descendantsMap; + PopulateDescendants(descendantsMap); + + std::function<void (const Event*, std::vector<const Event*>&)> + FindDescendantEvents = [&](const Event* eventPtr, + std::vector<const Event*>& sequence) { - if (tagEndIter->m_Tag != tag) + sequence.push_back(eventPtr); + + if (CalcLevel(eventPtr) > baseLevel+2) //We only care about levels as deep as workload executions. { - break; + return; } - } - outStream << "> Begin Tag: " << tag << std::endl; + auto children = descendantsMap.find(eventPtr); + if (children == descendantsMap.end()) + { + return; + } + + for (const Event* child : children->second) + { + return FindDescendantEvents(child, sequence); + } + }; + + // Third, find events belonging to each inference + int inferenceIdx = 0; + for (auto inference : inferences) + { + std::vector<const Event*> sequence; + + //build sequence, depth first + FindDescendantEvents(inference, sequence); + + outStream << "> Begin Inference: " << inferenceIdx << std::endl; outStream << std::endl; - AnalyzeEventSequenceAndWriteResults(iter, tagEndIter, outStream); + AnalyzeEventSequenceAndWriteResults(sequence.cbegin(), + sequence.cend(), + outStream); outStream << std::endl; - outStream << "> End Tag: " << tag << std::endl; + outStream << "> End Inference: " << inferenceIdx << std::endl; - iter = tagEndIter; + inferenceIdx++; } } } @@ -253,21 +488,6 @@ void Profiler::WaitForDevice(Compute compute) const #endif } -const char* Profiler::GetEventComputeDevice(Compute compute) const -{ - switch(compute) - { - case Compute::CpuRef: - return "CpuRef"; - case Compute::CpuAcc: - return "CpuAcc"; - case Compute::GpuAcc: - return "GpuAcc"; - default: - return "Undefined"; - } -} - std::uint32_t Profiler::GetEventColor(Compute compute) const { switch(compute) @@ -287,7 +507,24 @@ std::uint32_t Profiler::GetEventColor(Compute compute) const } } -} // namespace armnn +// The thread_local pointer to the profiler instance. 
+thread_local Profiler* tl_Profiler = nullptr; + +ProfilerManager& ProfilerManager::GetInstance() +{ + // Global reference to the single ProfileManager instance allowed. + static ProfilerManager s_ProfilerManager; + return s_ProfilerManager; +} + +void ProfilerManager::RegisterProfiler(Profiler* profiler) +{ + tl_Profiler = profiler; +} -#endif // ARMNN_PROFILING_ENABLED +Profiler* ProfilerManager::GetProfiler() +{ + return tl_Profiler; +} +} // namespace armnn diff --git a/src/armnn/Profiling.hpp b/src/armnn/Profiling.hpp index 88a7adff7c..33c5f46886 100644 --- a/src/armnn/Profiling.hpp +++ b/src/armnn/Profiling.hpp @@ -4,9 +4,12 @@ // #pragma once -#if ARMNN_PROFILING_ENABLED +#include "ProfilingEvent.hpp" #include "armnn/ArmNN.hpp" +#include "armnn/IProfiler.hpp" + +#include "WallClockTimer.hpp" #include <chrono> #include <iosfwd> @@ -15,82 +18,52 @@ #include <stack> #include <map> +#include <boost/core/ignore_unused.hpp> + namespace armnn { -// Clock class that uses the same timestamp function as the Mali DDK -class monotonic_clock { -public: - using duration = std::chrono::nanoseconds; - using time_point = std::chrono::time_point<monotonic_clock, duration>; - - static std::chrono::time_point<monotonic_clock, std::chrono::nanoseconds> now() noexcept - { - timespec ts; -#if defined(CLOCK_MONOTONIC_RAW) - clock_gettime(CLOCK_MONOTONIC_RAW, &ts); -#else - clock_gettime(CLOCK_MONOTONIC, &ts); -#endif - return time_point(std::chrono::nanoseconds(ts.tv_sec*1000000000 + ts.tv_nsec)); - } -}; - // Simple single-threaded profiler. // Tracks events reported by BeginEvent()/EndEvent() and outputs detailed information and stats when // Profiler::AnalyzeEventsAndWriteResults() is called. -class Profiler +class Profiler final : public IProfiler { public: + Profiler(); + ~Profiler(); + using InstrumentPtr = std::unique_ptr<Instrument>; + // Marks the beginning of a user-defined event. - // No attempt will be made to copy the name string: It must be known at compile time. - void BeginEvent(Compute compute, const std::string name); + // No attempt will be made to copy the name string: it must be known at compile time. + Event* BeginEvent(Compute compute, const std::string& name, std::vector<InstrumentPtr>&& instruments); // Marks the end of a user-defined event. - void EndEvent(Compute compute); + void EndEvent(Event* event); + + // Enables/disables profiling. + void EnableProfiling(bool enableProfiling) override; + + // Checks if profiling is enabled. + bool IsProfilingEnabled() override; // Increments the event tag, allowing grouping of events in a user-defined manner (e.g. per inference). - void UpdateEventTag() { ++m_EventTag; m_EventTagUpdated = true; } + void UpdateEventTag(); // Analyzes the tracked events and writes the results to the given output stream. // Please refer to the configuration variables in Profiling.cpp to customize the information written. - void AnalyzeEventsAndWriteResults(std::ostream& outStream) const; + void AnalyzeEventsAndWriteResults(std::ostream& outStream) const override; - // Accesses the singleton - static Profiler& Get() { return s_Instance; } + // Print stats for events in JSON Format to the given output stream. 
+ void Print(std::ostream& outStream) const override; - // Gets a string name for a given Compute device enum - const char* GetEventComputeDevice(Compute compute) const; - - // Gets the color to render an event with, based on which device it denotes - std::uint32_t GetEventColor(Compute compute) const; - - typedef monotonic_clock Clock; - typedef std::chrono::time_point<Clock> TimePoint; + // Gets the color to render an event with, based on which device it denotes. + uint32_t GetEventColor(Compute compute) const; private: - + using EventPtr = std::unique_ptr<Event>; struct Marker { std::size_t m_Id; - const std::string m_EventName; - TimePoint m_TimeStamp; - Compute m_ComputeDevice; - std::uint32_t m_Tag; - }; - - struct ProfilingEvent - { - std::string m_Label; - TimePoint m_StartTime; - TimePoint m_StopTime; - Compute m_Device; - std::uint32_t m_Tag; - - double DurationMs() const - { - return std::chrono::duration<double>(m_StopTime - m_StartTime).count()*1000.0; - } }; struct ProfilingEventStats @@ -98,62 +71,100 @@ private: double m_TotalMs; double m_MinMs; double m_MaxMs; - std::uint32_t m_Count; + uint32_t m_Count; }; - Profiler(); - ~Profiler(); - // Waits for a compute device to finish working to guarantee correct timings. // Currently used exclusively when emitting profiling events denoting GPU work. void WaitForDevice(Compute compute) const; - void AnalyzeEventSequenceAndWriteResults(std::vector<ProfilingEvent>::const_iterator first, - std::vector<ProfilingEvent>::const_iterator last, - std::ostream& outStream) const; + template<typename EventIterType> + void AnalyzeEventSequenceAndWriteResults(EventIterType first, EventIterType last, std::ostream& outStream) const; std::map<std::string, ProfilingEventStats> CalculateProfilingEventStats() const; + void PopulateInferences(std::vector<const Event*>& outInferences, int& outBaseLevel) const; + void PopulateDescendants(std::map<const Event*, std::vector<const Event*>>& outDescendantsMap) const; - std::stack<Marker> m_ObservedMarkers; - std::vector<ProfilingEvent> m_EventSequence; - std::uint32_t m_EventTag; - std::uint32_t m_NestingLevel; - bool m_EventTagUpdated; + std::stack<Event*> m_Parents; + std::vector<EventPtr> m_EventSequence; + bool m_ProfilingEnabled; - static Profiler s_Instance; +private: + // Friend functions for unit testing, see ProfilerTests.cpp. + friend size_t GetProfilerEventSequenceSize(armnn::Profiler* profiler); }; -// Helper to easily add event markers to the codebase +// Singleton profiler manager. +// Keeps track of all the running profiler instances. +class ProfilerManager +{ +public: + // Register the given profiler as a thread local pointer. + void RegisterProfiler(Profiler* profiler); + + // Gets the thread local pointer to the profiler. + Profiler* GetProfiler(); + + // Accesses the singleton. + static ProfilerManager& GetInstance(); + +private: + // The constructor is kept private so that other instances of this class (other that the singleton's) + // can't be allocated. + ProfilerManager() {} +}; + +// Helper to easily add event markers to the codebase. class ScopedProfilingEvent { public: - ScopedProfilingEvent(Compute compute, const std::string name) - : m_Compute(compute) + using InstrumentPtr = std::unique_ptr<Instrument>; + + template<typename... Args> + ScopedProfilingEvent(Compute compute, const std::string& name, Args... 
args) + : m_Event(nullptr) + , m_Profiler(ProfilerManager::GetInstance().GetProfiler()) { - Profiler::Get().BeginEvent(compute, name); + if (m_Profiler && m_Profiler->IsProfilingEnabled()) + { + std::vector<InstrumentPtr> instruments(0); + instruments.reserve(sizeof...(args)); //One allocation + ConstructNextInVector(instruments, args...); + m_Event = m_Profiler->BeginEvent(compute, name, std::move(instruments)); + } } ~ScopedProfilingEvent() { - Profiler::Get().EndEvent(m_Compute); + if (m_Profiler && m_Event) + { + m_Profiler->EndEvent(m_Event); + } } private: - armnn::Compute m_Compute; -}; - -} // namespace armnn -// Allows grouping events in an user-defined manner (e.g. per inference) -#define ARMNN_UPDATE_PROFILING_EVENT_TAG() armnn::Profiler::Get().UpdateEventTag(); + void ConstructNextInVector(std::vector<InstrumentPtr>& instruments) + { + boost::ignore_unused(instruments); + } -// The event name must be known at compile time -#define ARMNN_SCOPED_PROFILING_EVENT(compute, name) armnn::ScopedProfilingEvent e_##__FILE__##__LINE__(compute, name); + template<typename Arg, typename... Args> + void ConstructNextInVector(std::vector<InstrumentPtr>& instruments, Arg arg, Args... args) + { + instruments.emplace_back(std::make_unique<Arg>(arg)); + ConstructNextInVector(instruments, args...); + } -#else + Event* m_Event; ///< Event to track + Profiler* m_Profiler; ///< Profiler used +}; -#define ARMNN_UPDATE_PROFILING_EVENT_TAG() -#define ARMNN_SCOPED_PROFILING_EVENT(compute, name) +} // namespace armnn -#endif // ARMNN_PROFILING_ENABLED +// The event name must be known at compile time +#define ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(compute, /*name,*/ ...) \ + armnn::ScopedProfilingEvent e_##__FILE__##__LINE__(compute, /*name,*/ __VA_ARGS__); +#define ARMNN_SCOPED_PROFILING_EVENT(compute, name) \ + ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(compute, name, armnn::WallClockTimer()) diff --git a/src/armnn/ProfilingEvent.cpp b/src/armnn/ProfilingEvent.cpp new file mode 100644 index 0000000000..42a44a7280 --- /dev/null +++ b/src/armnn/ProfilingEvent.cpp @@ -0,0 +1,103 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "Profiling.hpp" +#include "ProfilingEvent.hpp" + +namespace armnn +{ +Event::Event(const std::string& eventName, + Profiler* profiler, + Event* parent, + const Compute computeDevice, + std::vector<InstrumentPtr>&& instruments) + : m_EventName(eventName) + , m_Profiler(profiler) + , m_Parent(parent) + , m_ComputeDevice(computeDevice) + , m_Instruments(std::move(instruments)) +{ +} + +Event::Event(Event&& other) noexcept + : m_EventName(std::move(other.m_EventName)) + , m_Profiler(other.m_Profiler) + , m_Parent(other.m_Parent) + , m_ComputeDevice(other.m_ComputeDevice) + , m_Instruments(std::move(other.m_Instruments)) + +{ +} + +Event::~Event() noexcept +{ +} + +void Event::Start() +{ + for (auto& instrument : m_Instruments) + { + instrument->Start(); + } +} + +void Event::Stop() +{ + for (auto& instrument : m_Instruments) + { + instrument->Stop(); + } +} + +const std::vector<Measurement> Event::GetMeasurements() const +{ + std::vector<Measurement> measurements; + for (auto& instrument : m_Instruments) + { + for (auto& measurement : instrument->GetMeasurements()) + { + measurements.emplace_back(std::move(measurement)); + } + } + return measurements; +} + +const std::string& Event::GetName() const +{ + return m_EventName; +} + +const Profiler* Event::GetProfiler() const +{ + return m_Profiler; +} + +const Event* Event::GetParentEvent() const +{ + return m_Parent; +} + +Compute Event::GetComputeDevice() const +{ + return m_ComputeDevice; +} + +Event& Event::operator=(Event&& other) noexcept +{ + if (this == &other) + { + return *this; + } + + m_EventName = other.m_EventName; + m_Profiler = other.m_Profiler; + m_Parent = other.m_Parent; + m_ComputeDevice = other.m_ComputeDevice; + other.m_Profiler = nullptr; + other.m_Parent = nullptr; + return *this; +} + +} // namespace armnn diff --git a/src/armnn/ProfilingEvent.hpp b/src/armnn/ProfilingEvent.hpp new file mode 100644 index 0000000000..61a2ee99e3 --- /dev/null +++ b/src/armnn/ProfilingEvent.hpp @@ -0,0 +1,92 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include <stack> +#include <vector> +#include <chrono> +#include <memory> +#include "Instrument.hpp" +#include "armnn/Types.hpp" + +namespace armnn +{ + +/// Forward declaration +class Profiler; + +/// Event class records measurements reported by BeginEvent()/EndEvent() and returns measurements when +/// Event::GetMeasurements() is called. 
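Editor's note: the Event class declared immediately below simply fans Start()/Stop() out to its instruments and concatenates their measurements. A small sketch of that flow using the WallClockTimer added elsewhere in this patch; the profiler and parent pointers are left null purely for illustration, and the Measurement member names m_Name/m_Value are assumptions based on how WallClockTimer aggregate-initialises its results:

#include "ProfilingEvent.hpp"
#include "WallClockTimer.hpp"
#include <iostream>
#include <memory>
#include <utility>
#include <vector>

void MeasureOnce()
{
    std::vector<std::unique_ptr<armnn::Instrument>> instruments;
    instruments.emplace_back(std::make_unique<armnn::WallClockTimer>());

    // Stand-alone event: no owning profiler and no parent event.
    armnn::Event event("ExampleEvent", nullptr, nullptr, armnn::Compute::CpuRef, std::move(instruments));

    event.Start();
    // ... work to be measured ...
    event.Stop();

    for (const armnn::Measurement& m : event.GetMeasurements())
    {
        // m_Name / m_Value are assumed member names; the patch only shows aggregate initialisation.
        std::cout << m.m_Name << " = " << m.m_Value << std::endl;
    }
}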
+class Event +{ +public: + using InstrumentPtr = std::unique_ptr<Instrument>; + using Instruments = std::vector<InstrumentPtr>; + + Event(const std::string& eventName, + Profiler* profiler, + Event* parent, + const Compute computeDevice, + std::vector<InstrumentPtr>&& instrument); + + Event(const Event& other) = delete; + + /// Move Constructor + Event(Event&& other) noexcept; + + /// Destructor + ~Event() noexcept; + + /// Start the Event + void Start(); + + /// Stop the Event + void Stop(); + + /// Get the recorded measurements calculated between Start() and Stop() + /// \return Recorded measurements of the event + const std::vector<Measurement> GetMeasurements() const; + + /// Get the name of the event + /// \return Name of the event + const std::string& GetName() const; + + /// Get the pointer of the profiler associated with this event + /// \return Pointer of the profiler associated with this event + const Profiler* GetProfiler() const; + + /// Get the pointer of the parent event + /// \return Pointer of the parent event + const Event* GetParentEvent() const; + + /// Get the compute device of the event + /// \return Compute device of the event + Compute GetComputeDevice() const; + + /// Assignment operator + Event& operator=(const Event& other) = delete; + + /// Move Assignment operator + Event& operator=(Event&& other) noexcept; + +private: + /// Name of the event + std::string m_EventName; + + /// Stored associated profiler + Profiler* m_Profiler; + + /// Stores optional parent event + Event* m_Parent; + + /// Compute device + Compute m_ComputeDevice; + + /// Instruments to use + Instruments m_Instruments; +}; + +} // namespace armnn diff --git a/src/armnn/Runtime.cpp b/src/armnn/Runtime.cpp index 0ca3446e1b..7d1a9faaea 100644 --- a/src/armnn/Runtime.cpp +++ b/src/armnn/Runtime.cpp @@ -45,22 +45,32 @@ int Runtime::GenerateNetworkId() Status Runtime::LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr inNetwork) { + std::string ignoredErrorMessage; + return LoadNetwork(networkIdOut, std::move(inNetwork), ignoredErrorMessage); +} + +Status Runtime::LoadNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr inNetwork, + std::string & errorMessage) +{ IOptimizedNetwork* rawNetwork = inNetwork.release(); unique_ptr<LoadedNetwork> loadedNetwork = LoadedNetwork::MakeLoadedNetwork( std::unique_ptr<OptimizedNetwork>(boost::polymorphic_downcast<OptimizedNetwork*>(rawNetwork)), - m_UseCpuRefAsFallback); + errorMessage); if (!loadedNetwork) { return Status::Failure; } - std::lock_guard<std::mutex> lockGuard(m_Mutex); - networkIdOut = GenerateNetworkId(); - // store the network - m_LoadedNetworks[networkIdOut] = std::move(loadedNetwork); + { + std::lock_guard<std::mutex> lockGuard(m_Mutex); + + // Stores the network + m_LoadedNetworks[networkIdOut] = std::move(loadedNetwork); + } return Status::Success; } @@ -70,7 +80,7 @@ Status Runtime::UnloadNetwork(NetworkId networkId) #ifdef ARMCOMPUTECL_ENABLED if (arm_compute::CLScheduler::get().context()() != NULL) { - // wait for all queued CL requests to finish before unloading the network they may be using + // Waits for all queued CL requests to finish before unloading the network they may be using. try { // Coverity fix: arm_compute::CLScheduler::sync() may throw an exception of type cl::Error. 
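Editor's note: the new LoadNetwork overload above surfaces the failure reason instead of returning only Status::Failure. A brief usage sketch, assuming an IRuntime and an optimized network are already available; the helper name LoadOrThrow is hypothetical and not part of the patch:

#include <armnn/ArmNN.hpp>
#include <stdexcept>
#include <string>
#include <utility>

armnn::NetworkId LoadOrThrow(armnn::IRuntime& runtime, armnn::IOptimizedNetworkPtr network)
{
    armnn::NetworkId networkId = 0;
    std::string errorMessage;

    if (runtime.LoadNetwork(networkId, std::move(network), errorMessage) != armnn::Status::Success)
    {
        // errorMessage is filled in by LoadedNetwork::MakeLoadedNetwork on failure.
        throw std::runtime_error("LoadNetwork failed: " + errorMessage);
    }
    return networkId;
}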
@@ -84,36 +94,55 @@ Status Runtime::UnloadNetwork(NetworkId networkId) } } #endif - std::lock_guard<std::mutex> lockGuard(m_Mutex); - if (m_LoadedNetworks.erase(networkId) == 0) { - BOOST_LOG_TRIVIAL(warning) << "WARNING: Runtime::UnloadNetwork(): " << networkId << " not found!"; - return Status::Failure; - } + std::lock_guard<std::mutex> lockGuard(m_Mutex); + + if (m_LoadedNetworks.erase(networkId) == 0) + { + BOOST_LOG_TRIVIAL(warning) << "WARNING: Runtime::UnloadNetwork(): " << networkId << " not found!"; + return Status::Failure; + } + #ifdef ARMCOMPUTECL_ENABLED - if (arm_compute::CLScheduler::get().context()() != NULL && m_LoadedNetworks.empty()) - { - // There are no loaded networks left, so clear the CL cache to free up memory - m_ClContextControl.ClearClCache(); - } + if (arm_compute::CLScheduler::get().context()() != NULL && m_LoadedNetworks.empty()) + { + // There are no loaded networks left, so clear the CL cache to free up memory + m_ClContextControl.ClearClCache(); + } #endif + } + BOOST_LOG_TRIVIAL(debug) << "Runtime::UnloadNetwork(): Unloaded network with ID: " << networkId; return Status::Success; } +const std::shared_ptr<IProfiler> Runtime::GetProfiler(NetworkId networkId) const +{ + auto it = m_LoadedNetworks.find(networkId); + if (it != m_LoadedNetworks.end()) + { + auto& loadedNetwork = it->second; + return loadedNetwork->GetProfiler(); + } + + return nullptr; +} + Runtime::Runtime(const CreationOptions& options) - : m_ClContextControl(options.m_ClTunedParameters) + : m_ClContextControl(options.m_GpuAccTunedParameters.get(), + options.m_EnableGpuProfiling) , m_NetworkIdCounter(0) { BOOST_LOG_TRIVIAL(info) << "ArmNN v" << ARMNN_VERSION << "\n"; - BOOST_LOG_TRIVIAL(info) << "Using compute device: " << options.m_DefaultComputeDevice << "\n"; - m_DeviceSpec.DefaultComputeDevice = options.m_DefaultComputeDevice; - // If useCpuRefAsFallback is false, the reference workload factory will be prevented from creating - // operation workloads, unless the default compute device is precisely the reference backend. - // This option is passed to the LoadedNetwork, which owns the workload factories. - m_UseCpuRefAsFallback = options.m_DefaultComputeDevice == Compute::CpuRef || options.m_UseCpuRefAsFallback; + m_DeviceSpec.m_SupportedComputeDevices.insert(armnn::Compute::CpuRef); + #if ARMCOMPUTECL_ENABLED + m_DeviceSpec.m_SupportedComputeDevices.insert(armnn::Compute::GpuAcc); + #endif + #if ARMCOMPUTENEON_ENABLED + m_DeviceSpec.m_SupportedComputeDevices.insert(armnn::Compute::CpuAcc); + #endif } Runtime::~Runtime() @@ -173,8 +202,8 @@ TensorInfo Runtime::GetOutputTensorInfo(NetworkId networkId, LayerBindingId laye } Status Runtime::EnqueueWorkload(NetworkId networkId, - const InputTensors& inputTensors, - const OutputTensors& outputTensors) + const InputTensors& inputTensors, + const OutputTensors& outputTensors) { LoadedNetwork* loadedNetwork = GetLoadedNetworkPtr(networkId); return loadedNetwork->EnqueueWorkload(inputTensors, outputTensors); diff --git a/src/armnn/Runtime.hpp b/src/armnn/Runtime.hpp index 3879e1dd52..151dde3588 100644 --- a/src/armnn/Runtime.hpp +++ b/src/armnn/Runtime.hpp @@ -5,6 +5,7 @@ #pragma once #include "LoadedNetwork.hpp" +#include "DeviceSpec.hpp" #include "armnn/INetwork.hpp" #include "armnn/IRuntime.hpp" #include "armnn/Tensor.hpp" @@ -19,29 +20,44 @@ namespace armnn class Runtime final : public IRuntime { public: - /// Load a complete network into the Runtime. 
- /// @param [out] networkIdOut Unique identifier for the network is returned in this reference. - /// @param [in] network Complete network to load into the Runtime. + /// Loads a complete network into the Runtime. + /// @param [out] networkIdOut - Unique identifier for the network is returned in this reference. + /// @param [in] network - Complete network to load into the Runtime. /// The runtime takes ownership of the network once passed in. /// @return armnn::Status virtual Status LoadNetwork(NetworkId& networkIdOut, IOptimizedNetworkPtr network) override; + /// Load a complete network into the IRuntime. + /// @param [out] networkIdOut Unique identifier for the network is returned in this reference. + /// @param [in] network Complete network to load into the IRuntime. + /// @param [out] errorMessage Error message if there were any errors. + /// The runtime takes ownership of the network once passed in. + /// @return armnn::Status + virtual Status LoadNetwork(NetworkId& networkIdOut, + IOptimizedNetworkPtr network, + std::string & errorMessage) override; + virtual TensorInfo GetInputTensorInfo(NetworkId networkId, LayerBindingId layerId) const override; virtual TensorInfo GetOutputTensorInfo(NetworkId networkId, LayerBindingId layerId) const override; - // Evaluate network using input in inputTensors, outputs filled into outputTensors + // Evaluates network using input in inputTensors, outputs filled into outputTensors. virtual Status EnqueueWorkload(NetworkId networkId, const InputTensors& inputTensors, const OutputTensors& outputTensors) override; - /// Unload a network from the Runtime. + /// Unloads a network from the Runtime. /// At the moment this only removes the network from the m_Impl->m_Network. /// This might need more work in the future to be AndroidNN compliant. /// @param [in] networkId Unique identifier for the network to be unloaded. Generated in LoadNetwork(). /// @return armnn::Status virtual Status UnloadNetwork(NetworkId networkId) override; - virtual const DeviceSpec& GetDeviceSpec() const override { return m_DeviceSpec; } + virtual const IDeviceSpec& GetDeviceSpec() const override { return m_DeviceSpec; } + + /// Gets the profiler corresponding to the given network id. + /// @param networkId The id of the network for which to get the profile. + /// @return A pointer to the requested profiler, or nullptr if not found. + virtual const std::shared_ptr<IProfiler> GetProfiler(NetworkId networkId) const override; /// Creates a runtime for workload execution. /// May throw a ClRuntimeUnavailableException if @a defaultComputeDevice requires a CL runtime but @@ -51,7 +67,7 @@ public: ~Runtime(); private: - friend void RuntimeLoadedNetworksReserve(armnn::Runtime* runtime); // see RuntimeTests.cpp + friend void RuntimeLoadedNetworksReserve(armnn::Runtime* runtime); // See RuntimeTests.cpp int GenerateNetworkId(); @@ -65,8 +81,6 @@ private: int m_NetworkIdCounter; - bool m_UseCpuRefAsFallback; - DeviceSpec m_DeviceSpec; }; diff --git a/src/armnn/Tensor.cpp b/src/armnn/Tensor.cpp index 2e04c8c617..e5d7f4b1b8 100644 --- a/src/armnn/Tensor.cpp +++ b/src/armnn/Tensor.cpp @@ -180,7 +180,7 @@ BaseTensor<MemoryType>& BaseTensor<MemoryType>::operator =(const BaseTensor<Memo return *this; } -// Explicit instantiations +// Explicit instantiations. 
template class BaseTensor<const void*>; template class BaseTensor<void*>; diff --git a/src/armnn/TypeUtils.hpp b/src/armnn/TypeUtils.hpp new file mode 100644 index 0000000000..2b70e28ff3 --- /dev/null +++ b/src/armnn/TypeUtils.hpp @@ -0,0 +1,40 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "armnn/Types.hpp" +#include "Half.hpp" + +namespace armnn +{ + + +template<DataType DT> +struct ResolveTypeImpl; + +template<> +struct ResolveTypeImpl<DataType::QuantisedAsymm8> +{ + using Type = uint8_t; +}; + +template <> +struct ResolveTypeImpl<DataType::Float16> +{ + using Type = Half; +}; + +template<> +struct ResolveTypeImpl<DataType::Float32> +{ + using Type = float; +}; + +template<DataType DT> +using ResolveType = typename ResolveTypeImpl<DT>::Type; + + +} //namespace armnn
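Editor's note: TypeUtils.hpp above introduces ResolveType, a compile-time mapping from a DataType enum value to its C++ storage type. A short sketch of how such an alias can be used; MakeZeroBuffer is a hypothetical helper, not part of the patch:

#include "TypeUtils.hpp"
#include <vector>

template<armnn::DataType DT>
std::vector<armnn::ResolveType<DT>> MakeZeroBuffer(unsigned int numElements)
{
    // ResolveType<DT> selects uint8_t, Half or float depending on DT.
    return std::vector<armnn::ResolveType<DT>>(numElements);
}

// MakeZeroBuffer<armnn::DataType::Float32>(4) yields a std::vector<float> of four zeros;
// MakeZeroBuffer<armnn::DataType::QuantisedAsymm8>(4) yields a std::vector<uint8_t>.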
\ No newline at end of file diff --git a/src/armnn/Utils.cpp b/src/armnn/Utils.cpp index fbde701a2a..5dafe54d7a 100644 --- a/src/armnn/Utils.cpp +++ b/src/armnn/Utils.cpp @@ -15,7 +15,7 @@ void ConfigureLogging(bool printToStandardOutput, bool printToDebugOutput, LogSe ConfigureLogging(boost::log::core::get().get(), printToStandardOutput, printToDebugOutput, severity); } -// Default to logging completely disabled. +// Defaults to logging completely disabled. // The user of the library must enable it if they want by calling armnn::ConfigureLogging(). struct DefaultLoggingConfiguration { @@ -27,4 +27,4 @@ struct DefaultLoggingConfiguration static DefaultLoggingConfiguration g_DefaultLoggingConfiguration; -}
\ No newline at end of file +} // namespace armnn
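Editor's note: as the Utils.cpp hunk above states, logging is now disabled by default and must be switched on explicitly. A minimal sketch, assuming the public armnn::ConfigureLogging declaration lives in armnn/Utils.hpp and that LogSeverity::Info is an available level:

#include <armnn/Utils.hpp>

int main()
{
    // Send ArmNN log messages of severity Info and above to standard output only.
    armnn::ConfigureLogging(true /*printToStandardOutput*/,
                            false /*printToDebugOutput*/,
                            armnn::LogSeverity::Info);

    // ... create the runtime and load networks as usual ...
    return 0;
}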
\ No newline at end of file diff --git a/src/armnn/WallClockTimer.cpp b/src/armnn/WallClockTimer.cpp new file mode 100644 index 0000000000..93d12222f7 --- /dev/null +++ b/src/armnn/WallClockTimer.cpp @@ -0,0 +1,41 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "WallClockTimer.hpp" + +namespace armnn +{ + +const std::string WallClockTimer::WALL_CLOCK_TIME ("Wall clock time"); +const std::string WallClockTimer::WALL_CLOCK_TIME_START(WallClockTimer::WALL_CLOCK_TIME + " (Start)"); +const std::string WallClockTimer::WALL_CLOCK_TIME_STOP (WallClockTimer::WALL_CLOCK_TIME + " (Stop)"); + +const char* WallClockTimer::GetName() const +{ + return "WallClockTimer"; +} + +void WallClockTimer::Start() +{ + m_Start = clock::now(); +} + +void WallClockTimer::Stop() +{ + m_Stop = clock::now(); +} + +std::vector<Measurement> WallClockTimer::GetMeasurements() const +{ + const auto delta = std::chrono::duration<double, std::milli>(m_Stop - m_Start); + const auto startTimeMs = std::chrono::duration<double, std::milli>(m_Start.time_since_epoch()); + const auto stopTimeMs = std::chrono::duration<double, std::milli>(m_Stop.time_since_epoch()); + + return { { WALL_CLOCK_TIME, delta.count(), Measurement::Unit::TIME_MS }, + { WALL_CLOCK_TIME_START, startTimeMs.count(), Measurement::Unit::TIME_MS }, + { WALL_CLOCK_TIME_STOP, stopTimeMs.count(), Measurement::Unit::TIME_MS } }; +} + +} //namespace armnn diff --git a/src/armnn/WallClockTimer.hpp b/src/armnn/WallClockTimer.hpp new file mode 100644 index 0000000000..84b46da8a2 --- /dev/null +++ b/src/armnn/WallClockTimer.hpp @@ -0,0 +1,63 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Instrument.hpp" +#include <chrono> + +namespace armnn +{ + +// Clock class that uses the same timestamp function as the Mali DDK. +class monotonic_clock_raw { +public: + using duration = std::chrono::nanoseconds; + using time_point = std::chrono::time_point<monotonic_clock_raw, duration>; + + static std::chrono::time_point<monotonic_clock_raw, std::chrono::nanoseconds> now() noexcept + { + timespec ts; + clock_gettime(CLOCK_MONOTONIC_RAW, &ts); + return time_point(std::chrono::nanoseconds(ts.tv_sec*1000000000 + ts.tv_nsec)); + } +}; + +// Implementation of an instrument to measure elapsed wall-clock time in milliseconds. +class WallClockTimer : public Instrument +{ +public: + // Construct a Wall Clock Timer + WallClockTimer() = default; + ~WallClockTimer() = default; + + // Start the Wall clock timer + void Start() override; + + // Stop the Wall clock timer + void Stop() override; + + // Get the name of the timer + const char* GetName() const override; + + // Get the recorded measurements + std::vector<Measurement> GetMeasurements() const override; + +#if defined(CLOCK_MONOTONIC_RAW) + using clock = monotonic_clock_raw; +#else + using clock = std::chrono::steady_clock; +#endif + + static const std::string WALL_CLOCK_TIME; + static const std::string WALL_CLOCK_TIME_START; + static const std::string WALL_CLOCK_TIME_STOP; + +private: + clock::time_point m_Start; + clock::time_point m_Stop; +}; + +} //namespace armnn diff --git a/src/armnn/backends/AclBaseMemoryManager.cpp b/src/armnn/backends/AclBaseMemoryManager.cpp deleted file mode 100644 index fc796995c7..0000000000 --- a/src/armnn/backends/AclBaseMemoryManager.cpp +++ /dev/null @@ -1,32 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. 
All rights reserved. -// See LICENSE file in the project root for full license information. -// -#include "AclBaseMemoryManager.hpp" - -namespace armnn -{ - -#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED -AclBaseMemoryManager::AclBaseMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc) -{ - // (re)create the memory manager components - m_Allocator = std::move(alloc); - m_IntraLayerLifetimeMgr = std::make_shared<arm_compute::BlobLifetimeManager>(); - m_IntraLayerPoolMgr = std::make_shared<arm_compute::PoolManager>(); - m_IntraLayerMemoryMgr = std::make_shared<arm_compute::MemoryManagerOnDemand>(m_IntraLayerLifetimeMgr, - m_IntraLayerPoolMgr); -} - -void AclBaseMemoryManager::Finalize() -{ - // Set allocator that the memory manager will use - m_IntraLayerMemoryMgr->set_allocator(m_Allocator.get()); - // Number of pools that the manager will create. This specifies how many layers you want to run in parallel - m_IntraLayerMemoryMgr->set_num_pools(1); - // Finalize the memory manager. (Validity checks, memory allocations, etc) - m_IntraLayerMemoryMgr->finalize(); -} -#endif - -} diff --git a/src/armnn/backends/AclBaseMemoryManager.hpp b/src/armnn/backends/AclBaseMemoryManager.hpp deleted file mode 100644 index 74b596fe97..0000000000 --- a/src/armnn/backends/AclBaseMemoryManager.hpp +++ /dev/null @@ -1,46 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. -// -#pragma once - -#include "WorkloadFactory.hpp" - -#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED -#include "arm_compute/runtime/IAllocator.h" -#include "arm_compute/runtime/BlobLifetimeManager.h" -#include "arm_compute/runtime/MemoryManagerOnDemand.h" -#include "arm_compute/runtime/PoolManager.h" - -#include <memory> -#endif - -namespace armnn -{ - -// ARM Compute Base Memory Manager -class AclBaseMemoryManager -{ -public: - - AclBaseMemoryManager() { } - virtual ~AclBaseMemoryManager() { } - -#if ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED - AclBaseMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc); - - void Finalize(); - - std::shared_ptr<arm_compute::MemoryManagerOnDemand>& Get() { return m_IntraLayerMemoryMgr; } - -protected: - - mutable std::unique_ptr<arm_compute::IAllocator> m_Allocator; - mutable std::shared_ptr<arm_compute::BlobLifetimeManager> m_IntraLayerLifetimeMgr; - mutable std::shared_ptr<arm_compute::PoolManager> m_IntraLayerPoolMgr; - mutable std::shared_ptr<arm_compute::MemoryManagerOnDemand> m_IntraLayerMemoryMgr; -#endif - -}; - -} //namespace armnn diff --git a/src/armnn/backends/ArmComputeTensorUtils.cpp b/src/armnn/backends/ArmComputeTensorUtils.cpp index f88ed2b4c3..8e4abaf67a 100644 --- a/src/armnn/backends/ArmComputeTensorUtils.cpp +++ b/src/armnn/backends/ArmComputeTensorUtils.cpp @@ -16,23 +16,17 @@ arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType) { switch(dataType) { + case armnn::DataType::Float16: + return arm_compute::DataType::F16; case armnn::DataType::Float32: - { return arm_compute::DataType::F32; - } case armnn::DataType::QuantisedAsymm8: - { return arm_compute::DataType::QASYMM8; - } case armnn::DataType::Signed32: - { return arm_compute::DataType::S32; - } default: - { BOOST_ASSERT_MSG(false, "Unknown data type"); return arm_compute::DataType::UNKNOWN; - } } } @@ -40,15 +34,15 @@ arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& te { arm_compute::TensorShape shape; - // armnn tensors are (batch, channels, height, width) - // arm_compute 
tensors are (width, height, channels, batch) + // armnn tensors are (batch, channels, height, width). + // arm_compute tensors are (width, height, channels, batch). for (unsigned int i = 0; i < tensorShape.GetNumDimensions(); i++) { - // note that our dimensions are stored in the opposite order to ACL's + // Note that our dimensions are stored in the opposite order to ACL's. shape.set(tensorShape.GetNumDimensions() - i - 1, tensorShape[i]); // TensorShape::set() flattens leading ones, so that batch size 1 cannot happen. - // arm_compute tensors expect this + // arm_compute tensors expect this. } // prevent arm_compute issue where tensor is flattened to nothing @@ -80,11 +74,18 @@ arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDes using arm_compute::PoolingLayerInfo; using arm_compute::Size2D; - // Resolve ARM Compute layer parameters + // Resolve ARM Compute layer parameters. const PoolingType poolingType = ConvertPoolingAlgorithmToAclPoolingType(descriptor.m_PoolType); + + bool isGlobalPooling = (descriptor.m_StrideX==0 && descriptor.m_StrideY==0); + //use specific constructor if global pooling + if(isGlobalPooling) + { + return arm_compute::PoolingLayerInfo(poolingType); + } + const DimensionRoundingType rounding = ConvertOutputShapeRoundingToAclDimensionRoundingType( descriptor.m_OutputShapeRounding); - const PadStrideInfo padStrideInfo(descriptor.m_StrideX, descriptor.m_StrideY, descriptor.m_PadLeft, diff --git a/src/armnn/backends/ArmComputeTensorUtils.hpp b/src/armnn/backends/ArmComputeTensorUtils.hpp index 84547f9c80..81c6620a01 100644 --- a/src/armnn/backends/ArmComputeTensorUtils.hpp +++ b/src/armnn/backends/ArmComputeTensorUtils.hpp @@ -20,26 +20,26 @@ class ITensorHandle; namespace armcomputetensorutils { -/// Utility function to map an armnn::DataType to corresponding arm_compute::DataType +/// Utility function to map an armnn::DataType to corresponding arm_compute::DataType. arm_compute::DataType GetArmComputeDataType(armnn::DataType dataType); -/// Utility function used to setup an arm_compute::TensorShape object from an armnn::TensorShape +/// Utility function used to setup an arm_compute::TensorShape object from an armnn::TensorShape. arm_compute::TensorShape BuildArmComputeTensorShape(const armnn::TensorShape& tensorShape); /// Utility function used to setup an arm_compute::ITensorInfo object whose dimensions are based on the given -/// armnn::ITensorInfo +/// armnn::ITensorInfo. arm_compute::TensorInfo BuildArmComputeTensorInfo(const armnn::TensorInfo& tensorInfo); -/// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor +/// Utility function used to setup an arm_compute::PoolingLayerInfo object from an armnn::Pooling2dDescriptor. arm_compute::PoolingLayerInfo BuildArmComputePoolingLayerInfo(const Pooling2dDescriptor& descriptor); -/// Utility function to setup an arm_compute::NormalizationLayerInfo object from an armnn::NormalizationDescriptor +/// Utility function to setup an arm_compute::NormalizationLayerInfo object from an armnn::NormalizationDescriptor. arm_compute::NormalizationLayerInfo BuildArmComputeNormalizationLayerInfo(const NormalizationDescriptor& desc); -/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector +/// Utility function used to setup an arm_compute::PermutationVector object from an armnn::PermutationVector. 
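Editor's note: the hunk above spells out the dimension-order mismatch, armnn shapes being (batch, channels, height, width) while ACL shapes are (width, height, channels, batch), so BuildArmComputeTensorShape writes each armnn dimension into the mirrored ACL position. A short sketch of the effect; the shape values are arbitrary and the function is purely illustrative:

#include "ArmComputeTensorUtils.hpp"

arm_compute::TensorShape ToAclShapeExample()
{
    const unsigned int dims[] = { 2, 3, 4, 5 };   // N, C, H, W in armnn order.
    const armnn::TensorShape armnnShape(4, dims);

    // The converted shape holds (5, 4, 3, 2), i.e. W, H, C, N, as ACL expects.
    return armnn::armcomputetensorutils::BuildArmComputeTensorShape(armnnShape);
}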
arm_compute::PermutationVector BuildArmComputePermutationVector(const armnn::PermutationVector& vector); -/// Utility function used to setup an arm_compute::PadStrideInfo object from an armnn layer descriptor +/// Utility function used to setup an arm_compute::PadStrideInfo object from an armnn layer descriptor. template <typename Descriptor> arm_compute::PadStrideInfo BuildArmComputePadStrideInfo(const Descriptor &descriptor) { @@ -65,6 +65,16 @@ void InitialiseArmComputeTensorEmpty(Tensor& tensor) tensor.allocator()->allocate(); } +/// Utility function to free unused tensors after a workload is configured and prepared +template <typename Tensor> +void FreeTensorIfUnused(std::unique_ptr<Tensor>& tensor) +{ + if (tensor && !tensor->is_used()) + { + tensor.reset(nullptr); + } +} + // Helper function to obtain byte offset into tensor data inline size_t GetTensorOffset(const arm_compute::ITensorInfo& info, uint32_t batchIndex, @@ -73,14 +83,14 @@ inline size_t GetTensorOffset(const arm_compute::ITensorInfo& info, uint32_t x) { arm_compute::Coordinates coords; - coords.set(3, boost::numeric_cast<int>(batchIndex)); - coords.set(2, boost::numeric_cast<int>(channelIndex)); - coords.set(1, boost::numeric_cast<int>(y)); - coords.set(0, boost::numeric_cast<int>(x)); + coords.set(3, static_cast<int>(batchIndex)); + coords.set(2, static_cast<int>(channelIndex)); + coords.set(1, static_cast<int>(y)); + coords.set(0, static_cast<int>(x)); return info.offset_element_in_bytes(coords); } -// Helper function to obtain element offset into data buffer representing tensor data (assuming no strides) +// Helper function to obtain element offset into data buffer representing tensor data (assuming no strides). inline size_t GetLinearBufferOffset(const arm_compute::ITensorInfo& info, uint32_t batchIndex, uint32_t channelIndex, @@ -88,25 +98,25 @@ inline size_t GetLinearBufferOffset(const arm_compute::ITensorInfo& info, uint32_t x) { const arm_compute::TensorShape& shape = info.tensor_shape(); - uint32_t width = boost::numeric_cast<uint32_t>(shape[0]); - uint32_t height = boost::numeric_cast<uint32_t>(shape[1]); - uint32_t numChannels = boost::numeric_cast<uint32_t>(shape[2]); + uint32_t width = static_cast<uint32_t>(shape[0]); + uint32_t height = static_cast<uint32_t>(shape[1]); + uint32_t numChannels = static_cast<uint32_t>(shape[2]); return ((batchIndex * numChannels + channelIndex) * height + y) * width + x; } template <typename T> void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData) { - // if MaxNumOfTensorDimensions is increased, this loop will need fixing + // If MaxNumOfTensorDimensions is increased, this loop will need fixing. 
static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyArmComputeITensorData"); { const arm_compute::ITensorInfo& info = *srcTensor.info(); const arm_compute::TensorShape& shape = info.tensor_shape(); const uint8_t* const bufferPtr = srcTensor.buffer(); - uint32_t width = boost::numeric_cast<uint32_t>(shape[0]); - uint32_t height = boost::numeric_cast<uint32_t>(shape[1]); - uint32_t numChannels = boost::numeric_cast<uint32_t>(shape[2]); - uint32_t numBatches = boost::numeric_cast<uint32_t>(shape[3]); + uint32_t width = static_cast<uint32_t>(shape[0]); + uint32_t height = static_cast<uint32_t>(shape[1]); + uint32_t numChannels = static_cast<uint32_t>(shape[2]); + uint32_t numBatches = static_cast<uint32_t>(shape[3]); for (unsigned int batchIndex = 0; batchIndex < numBatches; ++batchIndex) { @@ -114,8 +124,8 @@ void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData { for (unsigned int y = 0; y < height; ++y) { - // Copy one row from arm_compute tensor buffer to linear memory buffer - // A row is the largest contiguous region we can copy, as the tensor data may be using strides + // Copies one row from arm_compute tensor buffer to linear memory buffer. + // A row is the largest contiguous region we can copy, as the tensor data may be using strides. memcpy(dstData + GetLinearBufferOffset(info, batchIndex, channelIndex, y, 0), bufferPtr + GetTensorOffset(info, batchIndex, channelIndex, y, 0), width * sizeof(T)); @@ -128,16 +138,16 @@ void CopyArmComputeITensorData(const arm_compute::ITensor& srcTensor, T* dstData template <typename T> void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor) { - // if MaxNumOfTensorDimensions is increased, this loop will need fixing + // If MaxNumOfTensorDimensions is increased, this loop will need fixing. static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyArmComputeITensorData"); { const arm_compute::ITensorInfo& info = *dstTensor.info(); const arm_compute::TensorShape& shape = info.tensor_shape(); uint8_t* const bufferPtr = dstTensor.buffer(); - uint32_t width = boost::numeric_cast<uint32_t>(shape[0]); - uint32_t height = boost::numeric_cast<uint32_t>(shape[1]); - uint32_t numChannels = boost::numeric_cast<uint32_t>(shape[2]); - uint32_t numBatches = boost::numeric_cast<uint32_t>(shape[3]); + uint32_t width = static_cast<uint32_t>(shape[0]); + uint32_t height = static_cast<uint32_t>(shape[1]); + uint32_t numChannels = static_cast<uint32_t>(shape[2]); + uint32_t numBatches = static_cast<uint32_t>(shape[3]); for (unsigned int batchIndex = 0; batchIndex < numBatches; ++batchIndex) { @@ -145,8 +155,8 @@ void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor { for (unsigned int y = 0; y < height; ++y) { - // Copy one row from linear memory buffer to arm_compute tensor buffer - // A row is the largest contiguous region we can copy, as the tensor data may be using strides + // Copies one row from linear memory buffer to arm_compute tensor buffer. + // A row is the largest contiguous region we can copy, as the tensor data may be using strides. memcpy(bufferPtr + GetTensorOffset(info, batchIndex, channelIndex, y, 0), srcData + GetLinearBufferOffset(info, batchIndex, channelIndex, y, 0), width * sizeof(T)); @@ -156,5 +166,34 @@ void CopyArmComputeITensorData(const T* srcData, arm_compute::ITensor& dstTensor } } +/// Construct a TensorShape object from an ArmCompute object based on arm_compute::Dimensions. 
+/// \tparam ArmComputeType Any type that implements the Dimensions interface +/// \tparam T Shape value type +/// \param shapelike An ArmCompute object that implements the Dimensions interface +/// \param initial A default value to initialise the shape with +/// \return A TensorShape object filled from the Acl shapelike object. +template<typename ArmComputeType, typename T> +TensorShape GetTensorShape(const ArmComputeType& shapelike, T initial) +{ + std::vector<unsigned int> s(MaxNumOfTensorDimensions, initial); + for (unsigned int i=0; i < shapelike.num_dimensions(); ++i) + { + s[(shapelike.num_dimensions()-1)-i] = boost::numeric_cast<unsigned int>(shapelike[i]); + } + return TensorShape(boost::numeric_cast<unsigned int>(shapelike.num_dimensions()), s.data()); +}; + +/// Get the strides from an ACL strides object +inline TensorShape GetStrides(const arm_compute::Strides& strides) +{ + return GetTensorShape(strides, 0U); +} + +/// Get the shape from an ACL shape object +inline TensorShape GetShape(const arm_compute::TensorShape& shape) +{ + return GetTensorShape(shape, 1U); +} + } // namespace armcomputetensorutils } // namespace armnn diff --git a/src/armnn/backends/ArmComputeUtils.hpp b/src/armnn/backends/ArmComputeUtils.hpp index c451e6434b..3c57fb59b7 100644 --- a/src/armnn/backends/ArmComputeUtils.hpp +++ b/src/armnn/backends/ArmComputeUtils.hpp @@ -36,7 +36,7 @@ CreateAclNormalizationLayerInfoForL2Normalization(const armnn::TensorInfo& tenso // For the reference implementation, to make alpha_ become 1, we'd have to use alpha = normSize instead. const float alpha = 1.0f; - // Don't offset the reduction + // Don't offset the reduction. const float kappa = 0.0f; // pow(reduction, -0.5) = 1 / sqrt(reduction) @@ -53,7 +53,7 @@ ConvertActivationFunctionToAclActivationFunction(ActivationFunction armnnFunctio switch (armnnFunction) { case ActivationFunction::Linear: return AclActivationFunction::LINEAR; - // Arm compute's 'logistic' function is non-parameterized, so it is exactly a sigmoid function + // Arm compute's 'logistic' function is non-parameterized, so it is exactly a sigmoid function. 
case ActivationFunction::Sigmoid: return AclActivationFunction::LOGISTIC; case ActivationFunction::ReLu: return AclActivationFunction::RELU; case ActivationFunction::BoundedReLu: return AclActivationFunction::LU_BOUNDED_RELU; @@ -112,6 +112,14 @@ ConvertNormalizationAlgorithmChannelToAclNormType(NormalizationAlgorithmChannel } } +inline arm_compute::FullyConnectedLayerInfo +ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(const FullyConnectedDescriptor& fullyConnectedDesc) +{ + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = fullyConnectedDesc.m_TransposeWeightMatrix; + return fc_info; +} + } #endif // ARMCOMPUTENEON_ENABLED || ARMCOMPUTECL_ENABLED diff --git a/src/armnn/backends/ClContextControl.cpp b/src/armnn/backends/ClContextControl.cpp index f086328e55..68e878da79 100644 --- a/src/armnn/backends/ClContextControl.cpp +++ b/src/armnn/backends/ClContextControl.cpp @@ -16,6 +16,7 @@ #include <boost/format.hpp> #include <boost/log/trivial.hpp> #include <boost/polymorphic_cast.hpp> +#include <boost/core/ignore_unused.hpp> #include "LeakChecking.hpp" @@ -29,22 +30,27 @@ class Device; namespace armnn { -ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters) +ClContextControl::ClContextControl(IGpuAccTunedParameters* clTunedParameters, + bool profilingEnabled) : m_clTunedParameters(boost::polymorphic_downcast<ClTunedParameters*>(clTunedParameters)) + , m_ProfilingEnabled(profilingEnabled) { + // Ignore m_ProfilingEnabled if unused to avoid compiling problems when ArmCompute is disabled. + boost::ignore_unused(m_ProfilingEnabled); + #ifdef ARMCOMPUTECL_ENABLED try { std::vector<cl::Platform> platforms; cl::Platform::get(&platforms); - // Select default platform as the first element + // Selects default platform for the first element. cl::Platform::setDefault(platforms[0]); std::vector<cl::Device> devices; platforms[0].getDevices(CL_DEVICE_TYPE_GPU, &devices); - // Select default device as the first element + // Selects default device for the first element. cl::Device::setDefault(devices[0]); } catch (const cl::Error& clError) @@ -54,15 +60,15 @@ ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters) ) % clError.what() % clError.err())); } - // Remove the use of global CL context + // Removes the use of global CL context. cl::Context::setDefault(cl::Context{}); BOOST_ASSERT(cl::Context::getDefault()() == NULL); - // Remove the use of global CL command queue + // Removes the use of global CL command queue. cl::CommandQueue::setDefault(cl::CommandQueue{}); BOOST_ASSERT(cl::CommandQueue::getDefault()() == NULL); - // always load the OpenCL runtime + // Always load the OpenCL runtime. LoadOpenClRuntime(); #endif } @@ -70,14 +76,14 @@ ClContextControl::ClContextControl(IClTunedParameters* clTunedParameters) ClContextControl::~ClContextControl() { #ifdef ARMCOMPUTECL_ENABLED - // load the OpencCL runtime without the tuned parameters to free the memory for them + // Load the OpencCL runtime without the tuned parameters to free the memory for them. try { UnloadOpenClRuntime(); } catch (const cl::Error& clError) { - // this should not happen, it is ignored if it does + // This should not happen, it is ignored if it does. // Coverity fix: BOOST_LOG_TRIVIAL (previously used here to report the error) may throw an // exception of type std::length_error. 
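Editor's note: the constructor change above threads the new profiling flag into ClContextControl, which (in the next hunk) creates the CL command queue with CL_QUEUE_PROFILING_ENABLE. A usage sketch from the application side; it assumes CreationOptions is the nested IRuntime::CreationOptions consumed by the Runtime constructor, and the surrounding flow (loading and running the network) is only indicated by comments:

#include <armnn/ArmNN.hpp>
#include <iostream>
#include <memory>
#include <utility>

void RunWithGpuProfiling(armnn::IOptimizedNetworkPtr network)
{
    armnn::IRuntime::CreationOptions options;
    options.m_EnableGpuProfiling = true;                       // Profile the OpenCL command queue.
    armnn::IRuntimePtr runtime = armnn::IRuntime::Create(options);

    armnn::NetworkId networkId = 0;
    if (runtime->LoadNetwork(networkId, std::move(network)) != armnn::Status::Success)
    {
        return;
    }

    std::shared_ptr<armnn::IProfiler> profiler = runtime->GetProfiler(networkId);
    if (profiler)
    {
        profiler->EnableProfiling(true);
    }

    // ... EnqueueWorkload calls for this network happen here ...

    if (profiler)
    {
        profiler->AnalyzeEventsAndWriteResults(std::cout);
    }
}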
@@ -107,23 +113,23 @@ void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters) if (arm_compute::CLScheduler::get().context()() != NULL) { - // wait for all queued CL requests to finish before reinitialising it + // Wait for all queued CL requests to finish before reinitialising it. arm_compute::CLScheduler::get().sync(); } try { arm_compute::CLKernelLibrary::get().clear_programs_cache(); - // initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no + // Initialise the scheduler with a dummy context to release the LLVM data (which only happens when there are no // context references); it is initialised again, with a proper context, later. arm_compute::CLScheduler::get().init(context, commandQueue, device); arm_compute::CLKernelLibrary::get().init(".", context, device); { // - // Here we replace the context with a new one which in - // the memory leak checks shows as an extra allocation but - // because of the scope of the leak check it doesn't count + // Here we replace the context with a new one in which + // the memory leak checks show it as an extra allocation but + // because of the scope of the leak checks, it doesn't count // the disposal of the original object. On the other hand it // does count the creation of this context which it flags // as a memory leak. By adding the following line we prevent @@ -133,24 +139,19 @@ void ClContextControl::DoLoadOpenClRuntime(bool useTunedParameters) context = cl::Context(device); } - bool enableProfiling = false; -#if ARMNN_PROFILING_ENABLED - enableProfiling = true; -#endif - if (useTunedParameters && - m_clTunedParameters && m_clTunedParameters->m_Mode == IClTunedParameters::Mode::UpdateTunedParameters) - { - enableProfiling = true; // Needed for the CLTuner to work. - } + // NOTE: In this specific case profiling has to be enabled on the command queue + // in order for the CLTuner to work. + bool profilingNeededForClTuner = useTunedParameters && m_clTunedParameters && + m_clTunedParameters->m_Mode == IGpuAccTunedParameters::Mode::UpdateTunedParameters; - if (enableProfiling) + if (m_ProfilingEnabled || profilingNeededForClTuner) { - // Create a new queue with profiling enabled + // Create a new queue with profiling enabled. commandQueue = cl::CommandQueue(context, device, CL_QUEUE_PROFILING_ENABLE); } else { - // Use default queue + // Use default queue. 
commandQueue = cl::CommandQueue(context, device); } } @@ -178,22 +179,22 @@ void ClContextControl::ClearClCache() DoLoadOpenClRuntime(true); } -armnn::IClTunedParameters* IClTunedParameters::CreateRaw(armnn::IClTunedParameters::Mode mode) +armnn::IGpuAccTunedParameters* IGpuAccTunedParameters::CreateRaw(armnn::IGpuAccTunedParameters::Mode mode) { return new ClTunedParameters(mode); } -armnn::IClTunedParametersPtr IClTunedParameters::Create(armnn::IClTunedParameters::Mode mode) +armnn::IGpuAccTunedParametersPtr IGpuAccTunedParameters::Create(armnn::IGpuAccTunedParameters::Mode mode) { - return IClTunedParametersPtr(CreateRaw(mode), &IClTunedParameters::Destroy); + return IGpuAccTunedParametersPtr(CreateRaw(mode), &IGpuAccTunedParameters::Destroy); } -void IClTunedParameters::Destroy(IClTunedParameters* params) +void IGpuAccTunedParameters::Destroy(IGpuAccTunedParameters* params) { delete params; } -ClTunedParameters::ClTunedParameters(armnn::IClTunedParameters::Mode mode) +ClTunedParameters::ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode) : m_Mode(mode) #ifdef ARMCOMPUTECL_ENABLED , m_Tuner(mode == ClTunedParameters::Mode::UpdateTunedParameters) diff --git a/src/armnn/backends/ClContextControl.hpp b/src/armnn/backends/ClContextControl.hpp index 8098e30b75..ee1b797055 100644 --- a/src/armnn/backends/ClContextControl.hpp +++ b/src/armnn/backends/ClContextControl.hpp @@ -13,15 +13,16 @@ namespace armnn { -class IClTunedParameters; +class IGpuAccTunedParameters; class ClTunedParameters; -// ARM Compute OpenCL context control +// ARM Compute OpenCL context control. class ClContextControl { public: - ClContextControl(IClTunedParameters* clTunedParameters = nullptr); + ClContextControl(IGpuAccTunedParameters* clTunedParameters = nullptr, + bool profilingEnabled = false); virtual ~ClContextControl(); @@ -31,7 +32,7 @@ public: // to release the cached memory used by the compute library. void UnloadOpenClRuntime(); - // Clear the CL cache, without losing the tuned parameter settings + // Clear the CL cache, without losing the tuned parameter settings. 
void ClearClCache(); private: @@ -40,12 +41,13 @@ private: ClTunedParameters* m_clTunedParameters; + bool m_ProfilingEnabled; }; -class ClTunedParameters : public IClTunedParameters +class ClTunedParameters : public IGpuAccTunedParameters { public: - ClTunedParameters(armnn::IClTunedParameters::Mode mode); + ClTunedParameters(armnn::IGpuAccTunedParameters::Mode mode); virtual void Load(const char* filename); virtual void Save(const char* filename) const; diff --git a/src/armnn/backends/ClLayerSupport.cpp b/src/armnn/backends/ClLayerSupport.cpp index 8905adf1fc..72594ac82b 100644 --- a/src/armnn/backends/ClLayerSupport.cpp +++ b/src/armnn/backends/ClLayerSupport.cpp @@ -7,7 +7,6 @@ #include "ClLayerSupport.hpp" #include "InternalTypes.hpp" - #include <armnn/Descriptors.hpp> #include <armnn/Types.hpp> #include <armnn/Tensor.hpp> @@ -16,10 +15,21 @@ #ifdef ARMCOMPUTECL_ENABLED #include "ClWorkloads/ClAdditionFloat32Workload.hpp" +#include "ClWorkloads/ClActivationFloat32Workload.hpp" +#include "ClWorkloads/ClBatchNormalizationFloat32Workload.hpp" + +#include "ClWorkloads/ClConvertFp16ToFp32Workload.hpp" +#include "ClWorkloads/ClConvertFp32ToFp16Workload.hpp" #include "ClWorkloads/ClConvolution2dBaseWorkload.hpp" +#include "ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp" +#include "ClWorkloads/ClL2NormalizationFloat32Workload.hpp" +#include "ClWorkloads/ClMultiplicationFloat32Workload.hpp" +#include "ClWorkloads/ClFullyConnectedFloat32Workload.hpp" #include "ClWorkloads/ClPooling2dBaseWorkload.hpp" #include "ClWorkloads/ClPermuteWorkload.hpp" #include "ClWorkloads/ClNormalizationFloat32Workload.hpp" +#include "ClWorkloads/ClSoftmaxBaseWorkload.hpp" +#include "ClWorkloads/ClLstmFloat32Workload.hpp" #endif using namespace boost; @@ -31,7 +41,7 @@ namespace template<unsigned int FilterSize> bool IsMatchingSize2d(const TensorInfo& weightInfo) { - // Width & Height must match + // Width & Height must match. return (weightInfo.GetShape()[3] == FilterSize) && (weightInfo.GetShape()[2] == FilterSize); } @@ -88,58 +98,10 @@ inline bool IsWorkloadSupported(FuncType&& func, std::string* reasonIfUnsupporte } //namespace -bool IsClActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters) -{ - if (parameters.m_Function != ActivationFunction::BoundedReLu) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Unsupported activation function, only BoundedReLu is supported"; - } - - return false; - } - - return true; -} - -bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported, - const DepthwiseConvolution2dDescriptor& parameters, - const TensorInfo& weights) -{ - if (weights.GetNumDimensions() != 4) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Depthwise convolution Weight tensor needs to be 4d"; - } - return false; - } - // weights.GetShape()[0] = channel multiplier - if (weights.GetShape()[0] != 1) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Channel multiplier only supports the value 1 in the CL backend"; - } - return false; - } - else if ((weights.GetDataType() == armnn::DataType::QuantisedAsymm8) && !IsMatchingSize2d<3>(weights)) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "CL backend only supports 3x3 filtering for Depthwise Convolution on 8-bit"; - } - return false; - } - - return true; -} - -template<typename Float32Func, typename Uint8Func, typename ... Params> +template<typename FloatFunc, typename Uint8Func, typename ... 
Params> bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported, DataType dataType, - Float32Func floatFuncPtr, + FloatFunc floatFuncPtr, Uint8Func uint8FuncPtr, Params&&... params) { @@ -147,19 +109,21 @@ bool IsSupportedForDataTypeCl(std::string* reasonIfUnsupported, IsSupportedForDataTypeGeneric(reasonIfUnsupported, dataType, floatFuncPtr, + floatFuncPtr, uint8FuncPtr, std::forward<Params>(params)...); } bool IsActivationSupportedCl(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<const ActivationDescriptor&>, - &IsClActivationUint8Supported, - descriptor); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClActivationWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor); } bool IsAdditionSupportedCl(const TensorInfo& input0, @@ -167,21 +131,30 @@ bool IsAdditionSupportedCl(const TensorInfo& input0, const TensorInfo& output, std::string* reasonIfUnsupported) { - return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionFloat32Workload::IsSupported(input0, + return FORWARD_CL_LAYER_SUPPORT_FUNC(ClAdditionValidate(input0, input1, output, reasonIfUnsupported)); } bool IsBatchNormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<const BatchNormalizationDescriptor&>, - &FalseFuncU8<const BatchNormalizationDescriptor&>, - descriptor); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClBatchNormalizationValidate, + reasonIfUnsupported, + input, + output, + mean, + var, + beta, + gamma, + descriptor); } bool IsConstantSupportedCl(const TensorInfo& output, @@ -206,20 +179,20 @@ bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convol bool strideIsOneOrTwo = strideXIsOneOrTwo && strideYIsOneOrTwo; bool strideIsOneOrTwoOrThree = ( strideXIsOneOrTwo || strideXIsThree ) && ( strideYIsOneOrTwo || strideYIsThree ); - // 1x1 convolution with strides of 1,2,3 + // 1x1 convolution with strides of 1,2,3. isSupported |= IsMatchingSize2d<1>(weightInfo) && ( strideIsOneOrTwoOrThree ); - // 3x3 convolution with strides of 1,2 + // 3x3 convolution with strides of 1,2. isSupported |= IsMatchingSize2d<3>(weightInfo) && ( strideIsOneOrTwo ); // 5x5 convolution with strides of 1,2 isSupported |= IsMatchingSize2d<5>(weightInfo) && ( strideIsOneOrTwo ); - //fall back to normal convolution for the asymmetric padding case. + //Fall back to normal convolution for the asymmetric padding case. if (desc.m_PadLeft != desc.m_PadRight || desc.m_PadTop != desc.m_PadBottom) { - //direct convolution does not support asymmetric padding yet. + //Direct convolution does not support asymmetric padding yet. 
isSupported = false; } @@ -250,27 +223,40 @@ bool IsConvolution2dSupportedCl(const TensorInfo& input, } bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &IsClDepthwiseConvolution2dDescParamsSupported, - &IsClDepthwiseConvolution2dDescParamsSupported, - descriptor, - weights); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClDepthwiseConvolutionWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor, + weights, + biases); } bool IsFullyConnectedSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + // At the moment U8 is unsupported + if (input.GetDataType() == DataType::QuantisedAsymm8) + { + return false; + } + FORWARD_WORKLOAD_VALIDATE_FUNC(ClFullyConnectedWorkloadValidate, + reasonIfUnsupported, + input, + output, + weights, + biases, + descriptor); } bool IsInputSupportedCl(const TensorInfo& input, @@ -283,12 +269,10 @@ bool IsInputSupportedCl(const TensorInfo& input, } bool IsL2NormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output); } bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs, @@ -304,13 +288,14 @@ bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs, bool IsMultiplicationSupportedCl(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported) { - ignore_unused(input1); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input0.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClMultiplicationWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); } bool IsNormalizationSupportedCl(const TensorInfo& input, @@ -358,14 +343,12 @@ bool IsResizeBilinearSupportedCl(const TensorInfo& input, } bool IsSoftmaxSupportedCl(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported) { ignore_unused(descriptor); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &TrueFunc<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(ClSoftmaxWorkloadValidate, reasonIfUnsupported, input, output); } bool IsSplitterSupportedCl(const TensorInfo& input, @@ -400,10 +383,59 @@ bool IsFloorSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported) { ignore_unused(output); - return IsSupportedForDataTypeCl(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + return IsClBackendSupported(reasonIfUnsupported) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &FalseFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>); +} + +bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, 
const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported) +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClLstmFloat32WorkloadValidate, reasonIfUnsupported, + input, outputStateIn, cellStateIn, scratchBuffer, outputStateOut, cellStateOut, + output, descriptor, inputToForgetWeights, inputToCellWeights, + inputToOutputWeights, recurrentToForgetWeights, + recurrentToCellWeights, recurrentToOutputWeights, + forgetGateBias, cellBias, outputGateBias, + inputToInputWeights, recurrentToInputWeights, + cellToInputWeights, inputGateBias, projectionWeights, + projectionBias, cellToForgetWeights, cellToOutputWeights); +} + +bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp16ToFp32WorkloadValidate, + reasonIfUnsupported, + input, + output, + reasonIfUnsupported); +} + +bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + FORWARD_WORKLOAD_VALIDATE_FUNC(ClConvertFp32ToFp16WorkloadValidate, + reasonIfUnsupported, + input, + output, + reasonIfUnsupported); } } diff --git a/src/armnn/backends/ClLayerSupport.hpp b/src/armnn/backends/ClLayerSupport.hpp index 4f71e907cf..791e904616 100644 --- a/src/armnn/backends/ClLayerSupport.hpp +++ b/src/armnn/backends/ClLayerSupport.hpp @@ -7,16 +7,17 @@ #include <armnn/DescriptorsFwd.hpp> #include <armnn/Types.hpp> #include <armnn/Tensor.hpp> +#include <armnn/ArmNN.hpp> namespace armnn { bool IsClDirectConvolution2dSupported(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc); -bool IsClActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters); bool IsClDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported, const DepthwiseConvolution2dDescriptor& parameters, const TensorInfo& weights); bool IsActivationSupportedCl(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -26,6 +27,11 @@ bool IsAdditionSupportedCl(const TensorInfo& input0, std::string* reasonIfUnsupported = nullptr); bool IsBatchNormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -40,11 +46,16 @@ bool IsConvolution2dSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsDepthwiseConvolutionSupportedCl(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* 
reasonIfUnsupported = nullptr); bool IsFullyConnectedSupportedCl(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -52,14 +63,30 @@ bool IsInputSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsL2NormalizationSupportedCl(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsLstmSupportedCl(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr); + bool IsMergerSupportedCl(const std::vector<const TensorInfo*> inputs, const OriginsDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); bool IsMultiplicationSupportedCl(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsNormalizationSupportedCl(const TensorInfo& input, @@ -84,6 +111,7 @@ bool IsResizeBilinearSupportedCl(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsSoftmaxSupportedCl(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -101,4 +129,13 @@ bool IsReshapeSupportedCl(const TensorInfo& input, bool IsFloorSupportedCl(const TensorInfo& input, const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp16ToFp32SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp32ToFp16SupportedCl(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + } diff --git a/src/armnn/backends/ClTensorHandle.hpp b/src/armnn/backends/ClTensorHandle.hpp index 49e18dad59..e3618a3c46 100644 --- a/src/armnn/backends/ClTensorHandle.hpp +++ b/src/armnn/backends/ClTensorHandle.hpp @@ -9,9 +9,12 @@ #include <arm_compute/runtime/CL/CLTensor.h> #include <arm_compute/runtime/CL/CLSubTensor.h> +#include <arm_compute/runtime/CL/CLMemoryGroup.h> +#include <arm_compute/runtime/IMemoryGroup.h> #include <arm_compute/core/TensorShape.h> #include <arm_compute/core/Coordinates.h> +#include <boost/polymorphic_pointer_cast.hpp> namespace armnn { @@ -22,9 +25,8 @@ class IClTensorHandle : public ITensorHandle public: virtual arm_compute::ICLTensor& GetTensor() = 0; virtual arm_compute::ICLTensor const& GetTensor() const = 0; - virtual void Map(bool blocking = true) = 0; - virtual void UnMap() = 0; virtual arm_compute::DataType GetDataType() const = 0; + virtual void SetMemoryGroup(const 
std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) = 0; }; class ClTensorHandle : public IClTensorHandle @@ -37,50 +39,98 @@ public: arm_compute::CLTensor& GetTensor() override { return m_Tensor; } arm_compute::CLTensor const& GetTensor() const override { return m_Tensor; } - virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);}; + virtual void Allocate() override {armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor);} - virtual void Map(bool blocking = true) override {m_Tensor.map(blocking);} - virtual void UnMap() override { m_Tensor.unmap();} + virtual void Manage() override + { + assert(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } - virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL;} + virtual const void* Map(bool blocking = true) const override + { + const_cast<arm_compute::CLTensor*>(&m_Tensor)->map(blocking); + return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override { const_cast<arm_compute::CLTensor*>(&m_Tensor)->unmap(); } + + virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; } + + virtual ITensorHandle* GetParent() const override { return nullptr; } virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override + { + m_MemoryGroup = boost::polymorphic_pointer_downcast<arm_compute::CLMemoryGroup>(memoryGroup); + } + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } private: arm_compute::CLTensor m_Tensor; - + std::shared_ptr<arm_compute::CLMemoryGroup> m_MemoryGroup; }; class ClSubTensorHandle : public IClTensorHandle { public: - ClSubTensorHandle(arm_compute::ICLTensor& parent, - const arm_compute::TensorShape& shape, - const arm_compute::Coordinates& coords) - : m_Tensor(&parent, shape, coords) + ClSubTensorHandle(IClTensorHandle* parent, + const arm_compute::TensorShape& shape, + const arm_compute::Coordinates& coords) + : m_Tensor(&parent->GetTensor(), shape, coords) { + parentHandle = parent; } arm_compute::CLSubTensor& GetTensor() override { return m_Tensor; } arm_compute::CLSubTensor const& GetTensor() const override { return m_Tensor; } - virtual void Allocate() override {}; - virtual void Map(bool blocking = true) override {m_Tensor.map(blocking);} - virtual void UnMap() override { m_Tensor.unmap();} + virtual void Allocate() override {} + virtual void Manage() override {} - virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL;} + virtual const void* Map(bool blocking = true) const override + { + const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->map(blocking); + return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override { const_cast<arm_compute::CLSubTensor*>(&m_Tensor)->unmap(); } + + virtual ITensorHandle::Type GetType() const override { return ITensorHandle::CL; } + + virtual ITensorHandle* GetParent() const override { return parentHandle; } virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void 
SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {} + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } + private: - arm_compute::CLSubTensor m_Tensor; + mutable arm_compute::CLSubTensor m_Tensor; + ITensorHandle* parentHandle = nullptr; }; -}
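The Map/Unmap pair introduced above replaces the old map/UnMap methods and returns a host pointer already adjusted by the tensor's first-element offset. A hedged caller-side sketch of how such a handle might be read back follows; the helper and its arguments are illustrative, and the include path is an assumption:

#include <cstddef>
#include <cstring>
#include "backends/ITensorHandle.hpp" // assumed internal header declaring armnn::ITensorHandle

// Illustrative helper, not ArmNN API: map the CL-backed handle, copy the data
// out on the host, then unmap.
void ReadBackFromCl(armnn::ITensorHandle& handle, void* dst, std::size_t byteCount)
{
    const void* src = handle.Map(true); // blocking map: waits until the buffer is host-visible
    std::memcpy(dst, src, byteCount);
    handle.Unmap();                     // release the mapping once the copy is done
}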
\ No newline at end of file +} diff --git a/src/armnn/backends/ClWorkloadFactory.cpp b/src/armnn/backends/ClWorkloadFactory.cpp index 916ca46aae..354440c7bc 100644 --- a/src/armnn/backends/ClWorkloadFactory.cpp +++ b/src/armnn/backends/ClWorkloadFactory.cpp @@ -15,9 +15,13 @@ #include <arm_compute/core/CL/CLKernelLibrary.h> #include <arm_compute/runtime/CL/CLBufferAllocator.h> #include <arm_compute/runtime/CL/CLScheduler.h> + +#include "ClWorkloads.hpp" + #include "backends/MemCopyWorkload.hpp" #include "backends/ClTensorHandle.hpp" -#include "ClWorkloads.hpp" + +#include "memory/IPoolManager.hpp" #endif #include "MakeWorkloadHelper.hpp" @@ -29,7 +33,9 @@ namespace armnn { -bool ClWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool ClWorkloadFactory::IsLayerSupported(const Layer& layer, + boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported) { return IWorkloadFactory::IsLayerSupported(Compute::GpuAcc, layer, dataType, outReasonIfUnsupported); } @@ -43,7 +49,10 @@ ClWorkloadFactory::ClWorkloadFactory() std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const { - return std::make_unique<ClTensorHandle>(tensorInfo); + std::unique_ptr<ClTensorHandle> tensorHandle = std::make_unique<ClTensorHandle>(tensorInfo); + tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup()); + + return tensorHandle; } std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateSubTensorHandle(ITensorHandle& parent, @@ -58,24 +67,25 @@ std::unique_ptr<ITensorHandle> ClWorkloadFactory::CreateSubTensorHandle(ITensorH coords.set_num_dimensions(subTensorShape.GetNumDimensions()); for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++) { - // arm compute indexes tensor coords in reverse order + // Arm compute indexes tensor coords in reverse order. 
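        // (For example, a 4-D sub-tensor origin given as {N, C, H, W} in ArmNN
        // ordering is written into the Arm Compute Coordinates as {W, H, C, N};
        // hence index i below reads from position (GetNumDimensions() - i - 1)
        // of the ArmNN origin.)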
unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1; coords.set(i, boost::numeric_cast<int>(subTensorOrigin[revertedIndex])); } - return std::make_unique<ClSubTensorHandle>(static_cast<ClTensorHandle&>(parent).GetTensor(), shape, coords); + return std::make_unique<ClSubTensorHandle>( + boost::polymorphic_downcast<IClTensorHandle*>(&parent), shape, coords); } std::unique_ptr<IWorkload> ClWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<CopyFromCpuToClFloat32Workload, CopyFromCpuToClUint8Workload>(descriptor, info); + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> ClWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<CopyFromClToCpuFloat32Workload, CopyFromClToCpuUint8Workload>(descriptor, info); + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, @@ -87,7 +97,8 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateActivation(const ActivationQ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSoftmax(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<ClSoftmaxFloat32Workload, ClSoftmaxUint8Workload>(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload<ClSoftmaxFloat32Workload, ClSoftmaxUint8Workload>(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<IWorkload> ClWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor, @@ -105,13 +116,14 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMerger(const MergerQu std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateFullyConnected( const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<ClFullyConnectedFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload<ClFullyConnectedFloat32Workload, NullWorkload>(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<ClPermuteFloat32Workload, ClPermuteUint8Workload>(descriptor, info); + return MakeWorkload<ClPermuteFloatWorkload, ClPermuteUint8Workload>(descriptor, info); } std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor, @@ -124,7 +136,7 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateConvolution2d(const C const WorkloadInfo& info) const { return MakeWorkload<ClConvolution2dFloat32Workload, ClConvolution2dUint8Workload>(descriptor, info, - m_MemoryManager.Get()); + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<IWorkload> ClWorkloadFactory::CreateDepthwiseConvolution2d( @@ -142,7 +154,7 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateNormalization(const N std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<ClAdditionFloat32Workload, NullWorkload>(descriptor, info); + return MakeWorkload<ClAdditionFloat32Workload, ClAdditionUint8Workload>(descriptor, info); } std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMultiplication( @@ 
-165,21 +177,7 @@ std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateMemCopy(const MemCopy throw InvalidArgumentException("ClWorkloadFactory: Invalid null input for MemCopy workload"); } - // Create a workload that will copy tensor data from the inputs, which can have a number of different formats, - // to CL tensors. - switch (descriptor.m_Inputs[0]->GetType()) - { - case ITensorHandle::Cpu: - return MakeWorkload<CopyFromCpuToClFloat32Workload, CopyFromCpuToClUint8Workload>(descriptor, info); -#if ARMCOMPUTENEON_ENABLED - case ITensorHandle::Neon: - { - return MakeWorkload<CopyFromNeonToClFloat32Workload, CopyFromNeonToClUint8Workload>(descriptor, info); - } -#endif - default: - throw InvalidArgumentException("ClWorkloadFactory: Destination type not supported for MemCopy Workload."); - } + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<armnn::IWorkload> ClWorkloadFactory::CreateResizeBilinear( @@ -220,11 +218,41 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescri return MakeWorkload<ClFloorFloat32Workload, NullWorkload>(descriptor, info); } +std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return MakeWorkload<ClLstmFloat32Workload, NullWorkload>(descriptor, info); +} + +std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique<ClConvertFp16ToFp32Workload>(descriptor, info); +} + +std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique<ClConvertFp32ToFp16Workload>(descriptor, info); +} + void ClWorkloadFactory::Finalize() { m_MemoryManager.Finalize(); } +void ClWorkloadFactory::Release() +{ + m_MemoryManager.Release(); +} + +void ClWorkloadFactory::Acquire() +{ + m_MemoryManager.Acquire(); +} + #else // #if ARMCOMPUTECL_ENABLED ClWorkloadFactory::ClWorkloadFactory() @@ -375,10 +403,38 @@ std::unique_ptr<IWorkload> ClWorkloadFactory::CreateFloor(const FloorQueueDescri return nullptr; } +std::unique_ptr<IWorkload> ClWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr<IWorkload> ClWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + void ClWorkloadFactory::Finalize() { } +void ClWorkloadFactory::Release() +{ +} + +void ClWorkloadFactory::Acquire() +{ +} + #endif // #if ARMCOMPUTECL_ENABLED } // namespace armnn diff --git a/src/armnn/backends/ClWorkloadFactory.hpp b/src/armnn/backends/ClWorkloadFactory.hpp index 7365fe9aeb..d0786f3fba 100644 --- a/src/armnn/backends/ClWorkloadFactory.hpp +++ b/src/armnn/backends/ClWorkloadFactory.hpp @@ -4,14 +4,17 @@ // #pragma once -#include "AclBaseMemoryManager.hpp" #include "OutputHandler.hpp" + #include "armnn/IRuntime.hpp" +#include <boost/optional.hpp> + +#include "memory/BaseMemoryManager.hpp" namespace armnn { -// ARM Compute OpenCL workload factory +// ARM Compute OpenCL workload factory. 
class ClWorkloadFactory : public IWorkloadFactory { public: @@ -19,7 +22,8 @@ public: virtual Compute GetCompute() const override { return Compute::GpuAcc; } - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const override { return true; } @@ -95,11 +99,26 @@ public: virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const override; - void Finalize() override; + virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual void Finalize() override; + + virtual void Release() override; + + virtual void Acquire() override; private: - mutable AclBaseMemoryManager m_MemoryManager; +#ifdef ARMCOMPUTECL_ENABLED + mutable ClMemoryManager m_MemoryManager; +#endif }; } // namespace armnn diff --git a/src/armnn/backends/ClWorkloadUtils.hpp b/src/armnn/backends/ClWorkloadUtils.hpp index 549a0bbc25..6b6a18e865 100644 --- a/src/armnn/backends/ClWorkloadUtils.hpp +++ b/src/armnn/backends/ClWorkloadUtils.hpp @@ -9,6 +9,15 @@ #include <arm_compute/runtime/CL/CLFunctions.h> #include <arm_compute/runtime/SubTensor.h> #include "ArmComputeTensorUtils.hpp" +#include "OpenClTimer.hpp" +#include "CpuTensorHandle.hpp" +#include "Half.hpp" + +#define ARMNN_SCOPED_PROFILING_EVENT_CL(name) \ + ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::GpuAcc, \ + name, \ + armnn::OpenClTimer(), \ + armnn::WallClockTimer()) namespace armnn { @@ -17,12 +26,12 @@ template <typename T> void CopyArmComputeClTensorData(const T* srcData, arm_compute::CLTensor& dstTensor) { { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "MapClTensorForWriting"); + ARMNN_SCOPED_PROFILING_EVENT_CL("MapClTensorForWriting"); dstTensor.map(true); } { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyToClTensor"); + ARMNN_SCOPED_PROFILING_EVENT_CL("CopyToClTensor"); armcomputetensorutils::CopyArmComputeITensorData<T>(srcData, dstTensor); } @@ -36,4 +45,21 @@ void InitialiseArmComputeClTensorData(arm_compute::CLTensor& clTensor, const T* CopyArmComputeClTensorData<T>(data, clTensor); } +inline void InitializeArmComputeClTensorDataForFloatTypes(arm_compute::CLTensor& clTensor, + const ConstCpuTensorHandle *handle) +{ + BOOST_ASSERT(handle); + switch(handle->GetTensorInfo().GetDataType()) + { + case DataType::Float16: + InitialiseArmComputeClTensorData(clTensor, handle->GetConstTensor<armnn::Half>()); + break; + case DataType::Float32: + InitialiseArmComputeClTensorData(clTensor, handle->GetConstTensor<float>()); + break; + default: + BOOST_ASSERT_MSG(false, "Unexpected floating point type."); + } +}; + } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads.hpp b/src/armnn/backends/ClWorkloads.hpp index 3b8cf50ace..9f5622a491 100644 --- a/src/armnn/backends/ClWorkloads.hpp +++ b/src/armnn/backends/ClWorkloads.hpp @@ -7,6 +7,7 @@ #include "backends/ClWorkloads/ClActivationFloat32Workload.hpp" #include "backends/ClWorkloads/ClActivationUint8Workload.hpp" #include 
"backends/ClWorkloads/ClAdditionFloat32Workload.hpp" +#include "backends/ClWorkloads/ClAdditionUint8Workload.hpp" #include "backends/ClWorkloads/ClBaseConstantWorkload.hpp" #include "backends/ClWorkloads/ClBaseMergerWorkload.hpp" #include "backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp" @@ -19,6 +20,7 @@ #include "backends/ClWorkloads/ClFloorFloat32Workload.hpp" #include "backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp" #include "backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp" +#include "backends/ClWorkloads/ClLstmFloat32Workload.hpp" #include "backends/ClWorkloads/ClMergerFloat32Workload.hpp" #include "backends/ClWorkloads/ClMergerUint8Workload.hpp" #include "backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp" @@ -32,4 +34,6 @@ #include "backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp" #include "backends/ClWorkloads/ClSoftmaxUint8Workload.hpp" #include "backends/ClWorkloads/ClSplitterFloat32Workload.hpp" -#include "backends/ClWorkloads/ClSplitterUint8Workload.hpp"
\ No newline at end of file +#include "backends/ClWorkloads/ClSplitterUint8Workload.hpp" +#include "backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp" +#include "backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp" diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp index fb5d78425e..f072549cbc 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.cpp @@ -9,10 +9,31 @@ namespace armnn { +arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + const arm_compute::ActivationLayerInfo activationLayerInfo = + ConvertActivationDescriptorToAclActivationLayerInfo(descriptor); + + if (input.GetDataType() == DataType::QuantisedAsymm8 && + activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, + "CL: Logistic Activations unsupported with QAsymm8 data type."}; + } + + return arm_compute::CLActivationLayer::validate(&aclInput, + &aclOutput, + activationLayerInfo); +} ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ActivationQueueDescriptor>(descriptor, info) + : FloatWorkload<ActivationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClActivationFloat32Workload", 1, 1); @@ -26,7 +47,7 @@ ClActivationFloat32Workload::ClActivationFloat32Workload(const ActivationQueueDe void ClActivationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationFloat32Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp index 9bab4202be..9fbfe95856 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClActivationFloat32Workload.hpp @@ -9,9 +9,12 @@ namespace armnn { +arm_compute::Status ClActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor); -// Activation layer execution -class ClActivationFloat32Workload : public Float32Workload<ActivationQueueDescriptor> +// Activation layer execution. 
+class ClActivationFloat32Workload : public FloatWorkload<ActivationQueueDescriptor> { public: ClActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp index 3671dd7187..75ab3d0691 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.cpp @@ -6,6 +6,7 @@ #include "ClActivationUint8Workload.hpp" #include "backends/ClLayerSupport.hpp" +#include "backends/ArmComputeUtils.hpp" #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" namespace armnn @@ -15,15 +16,8 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri const WorkloadInfo& info) : Uint8Workload<ActivationQueueDescriptor>(descriptor, info) { - - std::string reasonIfUnsupported; - if (!IsClActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters)) - { - throw InvalidArgumentException(reasonIfUnsupported); - } - - // Only BoundedReLu is supported (see IsClActivationUint8Supported) - arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function); + arm_compute::ActivationLayerInfo layerInfo(activation, m_Data.m_Parameters.m_A, m_Data.m_Parameters.m_B); @@ -37,7 +31,7 @@ ClActivationUint8Workload::ClActivationUint8Workload(const ActivationQueueDescri void ClActivationUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClActivationUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClActivationUint8Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp index 3a9cceb298..449b2d56c5 100644 --- a/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClActivationUint8Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -// Activation layer execution +// Activation layer execution. class ClActivationUint8Workload : public Uint8Workload<ActivationQueueDescriptor> { public: diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp new file mode 100644 index 0000000000..5dd7bb323a --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.cpp @@ -0,0 +1,71 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClAdditionBaseWorkload.hpp" + +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; + +template <armnn::DataType... 
T> +ClAdditionBaseWorkload<T...>::ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor, + const WorkloadInfo& info) + : TypedWorkload<AdditionQueueDescriptor, T...>(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("ClAdditionBaseWorkload", 2, 1); + + arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[1])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor(); + m_Layer.configure(&input0, &input1, &output, g_AclConvertPolicy); +} + +template <armnn::DataType... T> +void ClAdditionBaseWorkload<T...>::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionBaseWorkload_Execute"); + m_Layer.run(); +} + +bool ClAdditionValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + if (input0.GetDataType() == DataType::QuantisedAsymm8) + { + // Reject quantised addition for the moment (COMPMID-1385) + *reasonIfUnsupported = "Quantised Addition not yet supported"; + return false; + } + + const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + const arm_compute::Status aclStatus = arm_compute::CLArithmeticAddition::validate(&aclInput0Info, + &aclInput1Info, + &aclOutputInfo, + g_AclConvertPolicy); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + *reasonIfUnsupported = aclStatus.error_description(); + } + + return supported; +} + +} //namespace armnn + +template class armnn::ClAdditionBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>; +template class armnn::ClAdditionBaseWorkload<armnn::DataType::QuantisedAsymm8>; diff --git a/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp new file mode 100644 index 0000000000..fba8a0d457 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionBaseWorkload.hpp @@ -0,0 +1,29 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +template <armnn::DataType... 
dataTypes> +class ClAdditionBaseWorkload : public TypedWorkload<AdditionQueueDescriptor, dataTypes...> +{ +public: + ClAdditionBaseWorkload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info); + + void Execute() const override; + +private: + mutable arm_compute::CLArithmeticAddition m_Layer; +}; + +bool ClAdditionValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output, + std::string* reasonIfUnsupported); +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp index 153167f172..b69593f5f5 100644 --- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.cpp @@ -13,45 +13,10 @@ namespace armnn { using namespace armcomputetensorutils; -ClAdditionFloat32Workload::ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, - const WorkloadInfo& info) - : Float32Workload<AdditionQueueDescriptor>(descriptor, info) -{ - m_Data.ValidateInputsOutputs("ClAdditionFloat32Workload", 2, 1); - - arm_compute::ICLTensor& input0 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); - arm_compute::ICLTensor& input1 = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor(); - arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); - m_Layer.configure(&input0, &input1, &output, ms_AclConvertPolicy); -} - void ClAdditionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClAdditionFloat32Workload_Execute"); - m_Layer.run(); -} - -bool ClAdditionFloat32Workload::IsSupported(const TensorInfo& input0, - const TensorInfo& input1, - const TensorInfo& output, - std::string* reasonIfUnsupported) -{ - const arm_compute::TensorInfo aclInput0Info = BuildArmComputeTensorInfo(input0); - const arm_compute::TensorInfo aclInput1Info = BuildArmComputeTensorInfo(input1); - const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); - - const arm_compute::Status aclStatus = decltype(m_Layer)::validate(&aclInput0Info, - &aclInput1Info, - &aclOutputInfo, - ms_AclConvertPolicy); - - const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); - if (!supported && reasonIfUnsupported) - { - *reasonIfUnsupported = aclStatus.error_description(); - } - - return supported; + ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionFloat32Workload_Execute"); + ClAdditionBaseWorkload::Execute(); } -} //namespace armnn
\ No newline at end of file +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp index 37e50c2c86..7eac485cfe 100644 --- a/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClAdditionFloat32Workload.hpp @@ -5,26 +5,16 @@ #pragma once -#include "backends/ClWorkloadUtils.hpp" +#include "ClAdditionBaseWorkload.hpp" namespace armnn { -class ClAdditionFloat32Workload : public Float32Workload<AdditionQueueDescriptor> +class ClAdditionFloat32Workload : public ClAdditionBaseWorkload<DataType::Float16, DataType::Float32> { public: - ClAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info); - + using ClAdditionBaseWorkload<DataType::Float16, DataType::Float32>::ClAdditionBaseWorkload; void Execute() const override; - - static bool IsSupported(const TensorInfo& input0, - const TensorInfo& input1, - const TensorInfo& output, - std::string* reasonIfUnsupported); - -private: - mutable arm_compute::CLArithmeticAddition m_Layer; - static constexpr arm_compute::ConvertPolicy ms_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; }; -} //namespace armnn
\ No newline at end of file +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp new file mode 100644 index 0000000000..a72ceca471 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.cpp @@ -0,0 +1,18 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClAdditionUint8Workload.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +void ClAdditionUint8Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClAdditionUint8Workload_Execute"); + ClAdditionBaseWorkload::Execute(); +} + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp new file mode 100644 index 0000000000..73ff287e7e --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClAdditionUint8Workload.hpp @@ -0,0 +1,20 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "ClAdditionBaseWorkload.hpp" + +namespace armnn +{ + +class ClAdditionUint8Workload : public ClAdditionBaseWorkload<DataType::QuantisedAsymm8> +{ +public: + using ClAdditionBaseWorkload<DataType::QuantisedAsymm8>::ClAdditionBaseWorkload; + void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp index 4b72d92d72..e0bc365053 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.cpp @@ -4,17 +4,19 @@ // #include "ClBaseConstantWorkload.hpp" +#include "backends/ArmComputeTensorUtils.hpp" #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" +#include "Half.hpp" namespace armnn { -template class ClBaseConstantWorkload<DataType::Float32>; +template class ClBaseConstantWorkload<DataType::Float16, DataType::Float32>; template class ClBaseConstantWorkload<DataType::QuantisedAsymm8>; -template<armnn::DataType dataType> -void ClBaseConstantWorkload<dataType>::Execute() const +template<armnn::DataType... dataTypes> +void ClBaseConstantWorkload<dataTypes...>::Execute() const { // The intermediate tensor held by the corresponding layer output handler can be initialised with the given data // on the first inference, then reused for subsequent inferences. 
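The switch from a single DataType template parameter to a DataTypes... pack, visible in the hunk above and repeated across the CL workloads in this diff, lets one workload class serve several tensor types and branch on the actual type at run time. A minimal standalone sketch of the idea, using illustrative names rather than ArmNN symbols:

#include <initializer_list>

enum class DataType { Float16, Float32, QuantisedAsymm8 };

// Sketch only: a base class parameterised on the set of data types it accepts,
// mirroring TypedWorkload<QueueDescriptor, DataTypes...> in spirit.
template <DataType... SupportedTypes>
struct TypedWorkloadSketch
{
    static bool Supports(DataType actual)
    {
        for (DataType candidate : {SupportedTypes...})
        {
            if (candidate == actual)
            {
                return true;
            }
        }
        return false;
    }
};

// A "float" workload that accepts both FP16 and FP32 tensors.
using FloatConstantSketch = TypedWorkloadSketch<DataType::Float16, DataType::Float32>;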
@@ -26,15 +28,21 @@ void ClBaseConstantWorkload<dataType>::Execute() const BOOST_ASSERT(data.m_LayerOutput != nullptr); arm_compute::CLTensor& output = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetTensor(); + arm_compute::DataType computeDataType = static_cast<ClTensorHandle*>(data.m_Outputs[0])->GetDataType(); - switch (dataType) + switch (computeDataType) { - case DataType::Float32: + case arm_compute::DataType::F16: + { + CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<Half>(), output); + break; + } + case arm_compute::DataType::F32: { CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<float>(), output); break; } - case DataType::QuantisedAsymm8: + case arm_compute::DataType::QASYMM8: { CopyArmComputeClTensorData(data.m_LayerOutput->GetConstTensor<uint8_t>(), output); break; diff --git a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp index 660842f375..7ad7bb93ca 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBaseConstantWorkload.hpp @@ -9,12 +9,12 @@ namespace armnn { -template <armnn::DataType DataType> -class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataType> +template <armnn::DataType... DataTypes> +class ClBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataTypes...> { public: ClBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload<ConstantQueueDescriptor, DataType>(descriptor, info) + : TypedWorkload<ConstantQueueDescriptor, DataTypes...>(descriptor, info) , m_RanOnce(false) { } diff --git a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp index 7542c62b47..531e32961b 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBaseMergerWorkload.hpp @@ -10,16 +10,16 @@ namespace armnn { -// Base class template providing an implementation of the Merger layer common to all data types -template <armnn::DataType DataType> -class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataType> +// Base class template providing an implementation of the Merger layer common to all data types. +template <armnn::DataType... DataTypes> +class ClBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...> { public: - using TypedWorkload<MergerQueueDescriptor, DataType>::TypedWorkload; + using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload; void Execute() const override { - // With subtensors, merger is a no-op + // With subtensors, merger is a no-op. } }; diff --git a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp index fef841ced2..8e4f10f9fd 100644 --- a/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBaseSplitterWorkload.hpp @@ -10,16 +10,16 @@ namespace armnn { -// Base class template providing an implementation of the Splitter layer common to all data types -template <armnn::DataType DataType> -class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataType> +// Base class template providing an implementation of the Splitter layer common to all data types. +template <armnn::DataType... 
DataTypes> +class ClBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...> { public: - using TypedWorkload<SplitterQueueDescriptor, DataType>::TypedWorkload; + using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload; void Execute() const override { - // With subtensors, merger is a no-op + // With subtensors, merger is a no-op. } }; diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp index dabd495d59..1849c5d411 100644 --- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.cpp @@ -7,36 +7,88 @@ #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClLayerSupport.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor &desc) +{ + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean); + const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var); + const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta); + const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma); + + return arm_compute::CLBatchNormalizationLayer::validate(&aclInputInfo, + &aclOutputInfo, + &aclMeanInfo, + &aclVarInfo, + &aclBetaInfo, + &aclGammaInfo, + desc.m_Eps); +} + ClBatchNormalizationFloat32Workload::ClBatchNormalizationFloat32Workload( const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info) { - BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo()); - BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo()); - BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo()); - BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo()); + m_Mean = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo()); + + m_Variance = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo()); + + m_Gamma = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo()); + + m_Beta = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo()); m_Data.ValidateInputsOutputs("ClBatchNormalizationFloat32Workload", 1, 1); arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); - m_Layer.configure(&input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps); - InitialiseArmComputeClTensorData(m_Mean, m_Data.m_Mean->GetConstTensor<float>()); - InitialiseArmComputeClTensorData(m_Variance, m_Data.m_Variance->GetConstTensor<float>()); - InitialiseArmComputeClTensorData(m_Beta, 
m_Data.m_Beta->GetConstTensor<float>()); - InitialiseArmComputeClTensorData(m_Gamma, m_Data.m_Gamma->GetConstTensor<float>()); + m_Layer.configure(&input, + &output, + m_Mean.get(), + m_Variance.get(), + m_Beta.get(), + m_Gamma.get(), + m_Data.m_Parameters.m_Eps); + + InitializeArmComputeClTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean); + InitializeArmComputeClTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance); + InitializeArmComputeClTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta); + InitializeArmComputeClTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma); + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_Layer.prepare(); + FreeUnusedTensors(); } void ClBatchNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClBatchNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClBatchNormalizationFloat32Workload_Execute"); m_Layer.run(); } +void ClBatchNormalizationFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_Mean); + FreeTensorIfUnused(m_Variance); + FreeTensorIfUnused(m_Gamma); + FreeTensorIfUnused(m_Beta); +} + } //namespace armnn
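The constructor above uploads the constant inputs, calls prepare() so the Compute Library can take its own copies and reshapes, and then drops the staging CLTensors via FreeTensorIfUnused. That helper is defined elsewhere in the backend and is not shown in this diff; the sketch below is only an assumption about its likely shape:

#include <memory>

// Assumed sketch only - the real FreeTensorIfUnused helper lives elsewhere in
// the backend sources and may differ in detail.
template <typename Tensor>
void FreeTensorIfUnusedSketch(std::unique_ptr<Tensor>& tensor)
{
    // Once prepare() has consumed a staging tensor, Compute Library marks it
    // unused and the host-side copy can be destroyed to reclaim memory.
    if (tensor && !tensor->is_used())
    {
        tensor.reset();
    }
}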
\ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp index ddbd0f05c0..a45614a284 100644 --- a/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClBatchNormalizationFloat32Workload.hpp @@ -10,21 +10,31 @@ namespace armnn { -class ClBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor> +arm_compute::Status ClBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor& desc); + +class ClBatchNormalizationFloat32Workload : public FloatWorkload<BatchNormalizationQueueDescriptor> { public: ClBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); - using Float32Workload<BatchNormalizationQueueDescriptor>::Float32Workload; + using FloatWorkload<BatchNormalizationQueueDescriptor>::FloatWorkload; void Execute() const override; private: mutable arm_compute::CLBatchNormalizationLayer m_Layer; - arm_compute::CLTensor m_Mean; - arm_compute::CLTensor m_Variance; - arm_compute::CLTensor m_Gamma; - arm_compute::CLTensor m_Beta; + std::unique_ptr<arm_compute::CLTensor> m_Mean; + std::unique_ptr<arm_compute::CLTensor> m_Variance; + std::unique_ptr<arm_compute::CLTensor> m_Gamma; + std::unique_ptr<arm_compute::CLTensor> m_Beta; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp index 99880d68a7..58594999a8 100644 --- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.cpp @@ -9,7 +9,7 @@ namespace armnn void ClConstantFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantFloat32Workload_Execute"); ClBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp index 5f86d3b2b6..11c3fda8db 100644 --- a/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClConstantFloat32Workload.hpp @@ -9,10 +9,10 @@ namespace armnn { -class ClConstantFloat32Workload : public ClBaseConstantWorkload<DataType::Float32> +class ClConstantFloat32Workload : public ClBaseConstantWorkload<DataType::Float16, DataType::Float32> { public: - using ClBaseConstantWorkload<DataType::Float32>::ClBaseConstantWorkload; + using ClBaseConstantWorkload<DataType::Float16, DataType::Float32>::ClBaseConstantWorkload; void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp index 078d4261fa..82ce436557 100644 --- a/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConstantUint8Workload.cpp @@ -9,7 +9,7 @@ namespace armnn void ClConstantUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConstantUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConstantUint8Workload_Execute"); ClBaseConstantWorkload::Execute(); } diff --git 
a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp new file mode 100644 index 0000000000..4914be78bc --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.cpp @@ -0,0 +1,64 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClConvertFp16ToFp32Workload.hpp" +#include "backends/ClTensorHandle.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; + +ClConvertFp16ToFp32Workload::ClConvertFp16ToFp32Workload( + const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info) : + Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("ClConvertFp16ToFp32Workload", 1, 1); + + arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor(); + + m_Layer.configure(&input, &output, g_AclConvertPolicy, 0); +} + +void ClConvertFp16ToFp32Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp16ToFp32Workload_Execute"); + m_Layer.run(); +} + +arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + if (input.GetDataType() != DataType::Float16) + { + *reasonIfUnsupported = "Input should be Float16"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + if (output.GetDataType() != DataType::Float32) + { + *reasonIfUnsupported = "Output should be Float32"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate( + &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + *reasonIfUnsupported = aclStatus.error_description(); + } + + return aclStatus; +} + + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp new file mode 100644 index 0000000000..36ccbb7144 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp16ToFp32Workload.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +class ClConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor> +{ +public: + + ClConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + mutable arm_compute::CLDepthConvertLayer m_Layer; +}; + +arm_compute::Status ClConvertFp16ToFp32WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported); + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp new file mode 100644 index 0000000000..19e064351f --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.cpp @@ -0,0 +1,64 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClConvertFp32ToFp16Workload.hpp" +#include "backends/ClTensorHandle.hpp" + +namespace armnn +{ +using namespace armcomputetensorutils; + +static constexpr arm_compute::ConvertPolicy g_AclConvertPolicy = arm_compute::ConvertPolicy::SATURATE; + +ClConvertFp32ToFp16Workload::ClConvertFp32ToFp16Workload( + const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info) : + Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("ClConvertFp32ToFp16Workload", 1, 1); + + arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(this->m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(this->m_Data.m_Outputs[0])->GetTensor(); + + m_Layer.configure(&input, &output, g_AclConvertPolicy, 0); +} + +void ClConvertFp32ToFp16Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvertFp32ToFp16Workload_Execute"); + m_Layer.run(); +} + +arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + if (input.GetDataType() != DataType::Float32) + { + *reasonIfUnsupported = "Input should be Float32"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + if (output.GetDataType() != DataType::Float16) + { + *reasonIfUnsupported = "Output should be Float16"; + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, *reasonIfUnsupported); + } + + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + const arm_compute::Status aclStatus = arm_compute::CLDepthConvertLayer::validate( + &aclInputInfo, &aclOutputInfo, g_AclConvertPolicy, 0); + + const bool supported = (aclStatus.error_code() == arm_compute::ErrorCode::OK); + if (!supported && reasonIfUnsupported) + { + *reasonIfUnsupported = aclStatus.error_description(); + } + + return aclStatus; +} + + +} //namespace armnn
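Taken together with the ClLayerSupport and ClWorkloadFactory additions earlier in this diff, the new conversion workloads are meant to be used query-then-create. A hedged caller-side sketch follows; the helper name and surrounding variables are illustrative and not part of the patch:

#include <string>
#include <armnn/Tensor.hpp>
// #include "backends/ClLayerSupport.hpp" // assumed internal header providing the Is*SupportedCl declarations

// Hedged sketch, not ArmNN API: check support first, then ask the workload
// factory for the conversion workload.
bool CanRunFp32ToFp16OnCl(const armnn::TensorInfo& inputInfo,
                          const armnn::TensorInfo& outputInfo,
                          std::string& reasonOut)
{
    // Fails with a reason when the input is not Float32 or the output is not Float16.
    return armnn::IsConvertFp32ToFp16SupportedCl(inputInfo, outputInfo, &reasonOut);
}

// If supported, the workload itself comes from the factory method added in this patch:
//   auto workload = clWorkloadFactory.CreateConvertFp32ToFp16(descriptor, info);
//   workload->Execute();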
\ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp new file mode 100644 index 0000000000..02a442dabc --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClConvertFp32ToFp16Workload.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +class ClConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor> +{ +public: + + ClConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + mutable arm_compute::CLDepthConvertLayer m_Layer; +}; + +arm_compute::Status ClConvertFp32ToFp16WorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported); + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp index d7aef3d223..9ac31df5c1 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.cpp @@ -15,13 +15,15 @@ using namespace armcomputetensorutils; ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<Convolution2dQueueDescriptor>(descriptor, info) + : FloatWorkload<Convolution2dQueueDescriptor>(descriptor, info) , m_ConvolutionLayer(memoryManager) { - // todo: check tensor shapes match + // todo: check tensor shapes match. 
const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - BuildArmComputeTensor(m_KernelTensor, weightInfo); + + m_KernelTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, m_Data.m_Parameters.m_StrideY, @@ -31,11 +33,10 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution m_Data.m_Parameters.m_PadBottom, arm_compute::DimensionRoundingType::FLOOR); - arm_compute::CLTensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClConvolution2dFloat32Workload", 1, 1); @@ -44,24 +45,35 @@ ClConvolution2dFloat32Workload::ClConvolution2dFloat32Workload(const Convolution arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); m_ConvolutionLayer.configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); - InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<float>()); + InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<float>()); + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_ConvolutionLayer.prepare(); + FreeUnusedTensors(); } void ClConvolution2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dFloat32Workload_Execute"); m_ConvolutionLayer.run(); } +void ClConvolution2dFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp index 4cf73c89cc..51c21aec32 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class ClConvolution2dFloat32Workload : public Float32Workload<Convolution2dQueueDescriptor> +class ClConvolution2dFloat32Workload : public FloatWorkload<Convolution2dQueueDescriptor> { public: ClConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, @@ -22,10 +22,12 @@ public: void Execute() const override; private: - mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; + mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_KernelTensor; + std::unique_ptr<arm_compute::CLTensor> m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp index cf419e752e..a78d7fb4a2 100644 --- 
a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.cpp @@ -18,10 +18,11 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu : Uint8Workload<Convolution2dQueueDescriptor>(descriptor, info) , m_ConvolutionLayer(memoryManager) { - // todo: check tensor shapes match const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - BuildArmComputeTensor(m_KernelTensor, weightInfo); + + m_KernelTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, m_Data.m_Parameters.m_StrideY, @@ -31,11 +32,10 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu m_Data.m_Parameters.m_PadBottom, arm_compute::DimensionRoundingType::FLOOR); - arm_compute::CLTensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClConvolution2dUint8Workload", 1, 1); @@ -44,25 +44,36 @@ ClConvolution2dUint8Workload::ClConvolution2dUint8Workload(const Convolution2dQu arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); m_ConvolutionLayer.configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); - InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>()); + InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>()); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<int32_t>()); + InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor<int32_t>()); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_ConvolutionLayer.prepare(); + FreeUnusedTensors(); } void ClConvolution2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClConvolution2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClConvolution2dUint8Workload_Execute"); m_ConvolutionLayer.run(); } +void ClConvolution2dUint8Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp index d4d3908c80..7d9eb76ba1 100644 --- a/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClConvolution2dUint8Workload.hpp @@ -22,10 +22,12 @@ public: void Execute() const override; private: - mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; + mutable arm_compute::CLConvolutionLayer m_ConvolutionLayer; - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_KernelTensor; + std::unique_ptr<arm_compute::CLTensor> m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp 
b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp new file mode 100644 index 0000000000..cfb8485039 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.cpp @@ -0,0 +1,122 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + +#include "TypeUtils.hpp" + +#include "backends/ArmComputeUtils.hpp" +#include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" + +namespace armnn +{ + +using namespace armcomputetensorutils; + +arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases) +{ + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeightsInfo = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiasesInfo; + arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiasesInfo = BuildArmComputeTensorInfo(biases); + optionalAclBiasesInfo = &aclBiasesInfo; + } + + const arm_compute::PadStrideInfo aclPadStrideInfo = BuildArmComputePadStrideInfo(descriptor); + const unsigned int aclDepthMultiplier = weights.GetShape()[0]; + + return arm_compute::CLDepthwiseConvolutionLayer::validate(&aclInputInfo, + &aclWeightsInfo, + optionalAclBiasesInfo, + &aclOutputInfo, + aclPadStrideInfo, + aclDepthMultiplier); +} + +template<armnn::DataType... dataTypes> +ClDepthwiseConvolutionBaseWorkload<dataTypes...>::ClDepthwiseConvolutionBaseWorkload( + const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info) + : TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>(descriptor, info) +{ + auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); + + m_KernelTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); + + if (m_Data.m_Parameters.m_BiasEnabled) + { + m_BiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); + } + + arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, + m_Data.m_Parameters.m_StrideY, + m_Data.m_Parameters.m_PadLeft, + m_Data.m_Parameters.m_PadRight, + m_Data.m_Parameters.m_PadTop, + m_Data.m_Parameters.m_PadBottom, + arm_compute::DimensionRoundingType::FLOOR); + + std::string name = std::string("ClDepthwiseConvolution") + + GetDataTypeName(m_Data.m_Weight->GetTensorInfo().GetDataType()) + "Workload"; + m_Data.ValidateInputsOutputs(name, 1, 1); + + arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); + + const unsigned int depthMultiplier = weightInfo.GetShape()[0]; + + //Check for optimisation opportunities. 
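Editorial note (not part of the patch): the shape test that follows routes 3x3 kernels to Compute Library's specialised CLDepthwiseConvolutionLayer3x3 and everything else to the generic CLDepthwiseConvolutionLayer. A hypothetical helper that spells out the assumption behind the indices — dimension 0 is read as the depth multiplier elsewhere in this constructor, so dimensions 2 and 3 are taken to be the kernel height and width:

#include <armnn/Tensor.hpp>

// Illustrative only: equivalent to the inline check below, with the layout assumption named.
bool CanUseDepthwise3x3Optimisation(const armnn::TensorInfo& weightInfo)
{
    const armnn::TensorShape& shape = weightInfo.GetShape();
    // Assumes depthwise weights are laid out as [multiplier, channels, height, width].
    return shape[2] == 3 && shape[3] == 3;
}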
+ bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3); + if (use3x3Optimisation) + { + m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>(); + static_cast<arm_compute::CLDepthwiseConvolutionLayer3x3*>(m_DepthwiseConvolutionLayer.get())->configure( + &input, + m_KernelTensor.get(), + m_BiasTensor.get(), + &output, + padStrideInfo, + depthMultiplier); + } + else + { + m_DepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>(); + static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_DepthwiseConvolutionLayer.get())->configure( + &input, + m_KernelTensor.get(), + m_BiasTensor.get(), + &output, + padStrideInfo, + depthMultiplier); + } + + BOOST_ASSERT(m_DepthwiseConvolutionLayer); +} + +template<armnn::DataType... dataTypes> +void ClDepthwiseConvolutionBaseWorkload<dataTypes...>::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + +// Generate known implementations for linker +template class ClDepthwiseConvolutionBaseWorkload<DataType::Float16, DataType::Float32>; +template class ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8>; + +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp new file mode 100644 index 0000000000..a879efc89e --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionBaseWorkload.hpp @@ -0,0 +1,37 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases); + +template<armnn::DataType... 
dataTypes> +class ClDepthwiseConvolutionBaseWorkload : public TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...> +{ +public: + using TypedWorkload<DepthwiseConvolution2dQueueDescriptor, dataTypes...>::m_Data; + + ClDepthwiseConvolutionBaseWorkload(const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); + +protected: + std::unique_ptr<arm_compute::IFunction> m_DepthwiseConvolutionLayer; + + std::unique_ptr<arm_compute::CLTensor> m_KernelTensor; + std::unique_ptr<arm_compute::CLTensor> m_BiasTensor; + + void FreeUnusedTensors(); +}; + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp index f31c73bc60..96d97ad4ea 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.cpp @@ -4,8 +4,8 @@ // #include "ClDepthwiseConvolutionFloat32Workload.hpp" -#include "ClDepthwiseConvolutionHelper.hpp" -#include "backends/ClTensorHandle.hpp" + +#include "backends/ClWorkloadUtils.hpp" #include "backends/CpuTensorHandle.hpp" namespace armnn @@ -14,17 +14,25 @@ namespace armnn ClDepthwiseConvolutionFloat32Workload::ClDepthwiseConvolutionFloat32Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info) + : ClDepthwiseConvolutionBaseWorkload(descriptor, info) { - InitClDepthwiseConvolutionWorkload(*this); + InitializeArmComputeClTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight); + + if (m_BiasTensor) + { + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); + } + + m_DepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void ClDepthwiseConvolutionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionFloat32Workload_Execute"); - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionFloat32Workload_Execute"); + BOOST_ASSERT(m_DepthwiseConvolutionLayer); - m_pDepthwiseConvolutionLayer->run(); + m_DepthwiseConvolutionLayer->run(); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp index 8711f0c515..669fd928b5 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionFloat32Workload.hpp @@ -5,29 +5,20 @@ #pragma once +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + #include "backends/ClWorkloadUtils.hpp" namespace armnn { -class ClDepthwiseConvolutionFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor> +class ClDepthwiseConvolutionFloat32Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::Float16, + DataType::Float32> { public: ClDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; - -private: - typedef float KernelDataType; - typedef float BiasDataType; - - mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer; - - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; - - template <typename WorkloadType> - friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload); }; } //namespace 
armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp deleted file mode 100644 index cd7115773d..0000000000 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionHelper.hpp +++ /dev/null @@ -1,91 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. -// - -#pragma once - -#include <armnn/TypesUtils.hpp> -#include "backends/ClLayerSupport.hpp" -#include "backends/ArmComputeTensorUtils.hpp" -#include "backends/ClTensorHandle.hpp" - -namespace armnn -{ - -template <typename WorkloadType> -void InitClDepthwiseConvolutionWorkload(WorkloadType& workload) -{ - using T = typename WorkloadType::KernelDataType; - using B = typename WorkloadType::BiasDataType; - - auto& m_Data = workload.GetData(); - auto& m_KernelTensor = workload.m_KernelTensor; - auto& m_BiasTensor = workload.m_BiasTensor; - auto& m_pDepthwiseConvolutionLayer = workload.m_pDepthwiseConvolutionLayer; - - auto& weightInfo = m_Data.m_Weight->GetTensorInfo(); - - std::string reasonIfUnsupported; - if (!IsClDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo)) - { - throw UnimplementedException(reasonIfUnsupported); - } - - armcomputetensorutils::BuildArmComputeTensor(m_KernelTensor, weightInfo); - - arm_compute::CLTensor* optionalBias = nullptr; - if (m_Data.m_Parameters.m_BiasEnabled) - { - armcomputetensorutils::BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; - } - - arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, - m_Data.m_Parameters.m_StrideY, - m_Data.m_Parameters.m_PadLeft, - m_Data.m_Parameters.m_PadRight, - m_Data.m_Parameters.m_PadTop, - m_Data.m_Parameters.m_PadBottom, - arm_compute::DimensionRoundingType::FLOOR); - - std::string name = std::string("ClDepthwiseConvolution") + GetDataTypeName(GetDataType<T>()) + "Workload"; - m_Data.ValidateInputsOutputs(name, 1, 1); - - arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); - arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); - - //Check for optimisation opportunities. - bool use3x3Optimisation = (weightInfo.GetShape()[3] == 3) && (weightInfo.GetShape()[2] == 3); - if (use3x3Optimisation) - { - m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer3x3>(); - static_cast<arm_compute::CLDepthwiseConvolutionLayer3x3*>(m_pDepthwiseConvolutionLayer.get())->configure( - &input, - &m_KernelTensor, - optionalBias, - &output, - padStrideInfo); - } - else - { - m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::CLDepthwiseConvolutionLayer>(); - static_cast<arm_compute::CLDepthwiseConvolutionLayer*>(m_pDepthwiseConvolutionLayer.get())->configure( - &input, - &m_KernelTensor, - optionalBias, - &output, - padStrideInfo); - } - - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - - InitialiseArmComputeClTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor<T>()); - - if (optionalBias) - { - InitialiseArmComputeClTensorData(*optionalBias, m_Data.m_Bias->template GetConstTensor<B>()); - } -} - -} //namespace armnn
\ No newline at end of file diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp index 7e7c488c74..4852ce8bf9 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.cpp @@ -4,28 +4,34 @@ // #include "ClDepthwiseConvolutionUint8Workload.hpp" -#include "ClDepthwiseConvolutionHelper.hpp" -#include "backends/ClTensorHandle.hpp" + #include "backends/CpuTensorHandle.hpp" namespace armnn { - ClDepthwiseConvolutionUint8Workload::ClDepthwiseConvolutionUint8Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Uint8Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info) + : ClDepthwiseConvolutionBaseWorkload(descriptor, info) { - InitClDepthwiseConvolutionWorkload(*this); + InitialiseArmComputeClTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<uint8_t>()); + + if (m_BiasTensor) + { + InitialiseArmComputeClTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>()); + } + + m_DepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void ClDepthwiseConvolutionUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClDepthwiseConvolutionUint8Workload_Execute"); - BOOST_ASSERT(m_pDepthwiseConvolutionLayer); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClDepthwiseConvolutionUint8Workload_Execute"); + BOOST_ASSERT(m_DepthwiseConvolutionLayer); - m_pDepthwiseConvolutionLayer->run(); + m_DepthwiseConvolutionLayer->run(); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp index ee09ff3e58..a4277d405f 100644 --- a/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClDepthwiseConvolutionUint8Workload.hpp @@ -5,29 +5,19 @@ #pragma once +#include "ClDepthwiseConvolutionBaseWorkload.hpp" + #include "backends/ClWorkloadUtils.hpp" namespace armnn { -class ClDepthwiseConvolutionUint8Workload : public Uint8Workload<DepthwiseConvolution2dQueueDescriptor> +class ClDepthwiseConvolutionUint8Workload : public ClDepthwiseConvolutionBaseWorkload<DataType::QuantisedAsymm8> { public: ClDepthwiseConvolutionUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; - -private: - typedef uint8_t KernelDataType; - typedef int32_t BiasDataType; - - mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer; - - arm_compute::CLTensor m_KernelTensor; - arm_compute::CLTensor m_BiasTensor; - - template <typename WorkloadType> - friend void InitClDepthwiseConvolutionWorkload(WorkloadType& workload); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp index 882da50855..da71c50305 100644 --- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn { ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<FloorQueueDescriptor>(descriptor, info) + : FloatWorkload<FloorQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClFloorFloat32Workload", 1, 1); @@ -22,7 +22,7 @@ 
ClFloorFloat32Workload::ClFloorFloat32Workload(const FloorQueueDescriptor& descr void ClFloorFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFloorFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClFloorFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp index 532dd29884..bd7f3032fc 100644 --- a/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClFloorFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClFloorFloat32Workload : public Float32Workload<FloorQueueDescriptor> +class ClFloorFloat32Workload : public FloatWorkload<FloorQueueDescriptor> { public: ClFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp index 5dfab9cbbd..5014dd27ca 100644 --- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.cpp @@ -7,47 +7,89 @@ #include "backends/ClTensorHandle.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ArmComputeUtils.hpp" +#include "backends/ClLayerSupport.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiases; + arm_compute::TensorInfo *optionalAclBiases = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiases = BuildArmComputeTensorInfo(biases); + optionalAclBiases = &aclBiases; + } + + const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); + + return arm_compute::CLFullyConnectedLayer::validate(&aclInput, + &aclWeights, + optionalAclBiases, + &aclOutput, + fullyConnectedLayerInfo); +} + ClFullyConnectedFloat32Workload::ClFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info) - , m_FullyConnected(memoryManager) + : FloatWorkload<FullyConnectedQueueDescriptor>(descriptor, info) + , m_FullyConnectedLayer(memoryManager) { + m_WeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - - arm_compute::CLTensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasesTensor; + m_BiasesTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); } m_Data.ValidateInputsOutputs("ClFullyConnectedFloat32Workload", 1, 1); arm_compute::ICLTensor& input = 
static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); + // Construct - m_FullyConnected.configure( - &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix); + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); // Allocate - InitialiseArmComputeClTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor<float>()); + InitializeArmComputeClTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight); - if (optionalBiasTensor) + if (m_BiasesTensor) { - InitialiseArmComputeClTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor<float>()); + InitializeArmComputeClTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_FullyConnectedLayer.prepare(); + FreeUnusedTensors(); } void ClFullyConnectedFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClFullyConnectedFloat32Workload_Execute"); - m_FullyConnected.run(); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClFullyConnectedFloat32Workload_Execute"); + m_FullyConnectedLayer.run(); +} + +void ClFullyConnectedFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_WeightsTensor); + FreeTensorIfUnused(m_BiasesTensor); } } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp index c8d1227bda..f580e580c6 100644 --- a/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClFullyConnectedFloat32Workload.hpp @@ -14,20 +14,29 @@ namespace armnn { -class ClFullyConnectedFloat32Workload : public armnn::Float32Workload<armnn::FullyConnectedQueueDescriptor> +arm_compute::Status ClFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor); + +class ClFullyConnectedFloat32Workload : public armnn::FloatWorkload<armnn::FullyConnectedQueueDescriptor> { public: ClFullyConnectedFloat32Workload(const armnn::FullyConnectedQueueDescriptor& descriptor, const armnn::WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager); - using armnn::Float32Workload<armnn::FullyConnectedQueueDescriptor>::m_Data; + using armnn::FloatWorkload<armnn::FullyConnectedQueueDescriptor>::m_Data; void Execute() const override; private: - mutable arm_compute::CLFullyConnectedLayer m_FullyConnected; - arm_compute::CLTensor m_WeightsTensor; - arm_compute::CLTensor m_BiasesTensor; + mutable arm_compute::CLFullyConnectedLayer m_FullyConnectedLayer; + + std::unique_ptr<arm_compute::CLTensor> m_WeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_BiasesTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp index e15db74ec9..628e38d3da 100644 --- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.cpp @@ -12,9 +12,21 @@ 
namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + CreateAclNormalizationLayerInfoForL2Normalization(input); + + return arm_compute::CLNormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<L2NormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClL2NormalizationFloat32Workload", 1, 1); @@ -25,7 +37,7 @@ ClL2NormalizationFloat32Workload::ClL2NormalizationFloat32Workload(const L2Norma void ClL2NormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClL2NormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClL2NormalizationFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp index 848803e2f0..bf898e31f7 100644 --- a/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClL2NormalizationFloat32Workload.hpp @@ -10,7 +10,10 @@ namespace armnn { -class ClL2NormalizationFloat32Workload : public Float32Workload<L2NormalizationQueueDescriptor> +arm_compute::Status ClL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +class ClL2NormalizationFloat32Workload : public FloatWorkload<L2NormalizationQueueDescriptor> { public: ClL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp new file mode 100644 index 0000000000..db5c303854 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.cpp @@ -0,0 +1,405 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "ClLstmFloat32Workload.hpp" +#include "backends/ClTensorHandle.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ClLayerSupport.hpp" +#include "arm_compute/runtime/CL/functions/CLLSTMLayer.h" + +namespace armnn +{ +using namespace armcomputetensorutils; + +ClLstmFloat32Workload::ClLstmFloat32Workload(const LstmQueueDescriptor &descriptor, const WorkloadInfo &info) + : FloatWorkload<LstmQueueDescriptor>(descriptor, info) +{ + arm_compute::LSTMParams<arm_compute::ICLTensor> lstm_param; + + // Basic parameters + m_InputToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToForgetWeightsTensor, m_Data.m_InputToForgetWeights->GetTensorInfo()); + + m_InputToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToCellWeightsTensor, m_Data.m_InputToCellWeights->GetTensorInfo()); + + m_InputToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToOutputWeightsTensor, m_Data.m_InputToOutputWeights->GetTensorInfo()); + + m_RecurrentToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToForgetWeightsTensor, m_Data.m_RecurrentToForgetWeights->GetTensorInfo()); + + m_RecurrentToCellWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToCellWeightsTensor, m_Data.m_RecurrentToCellWeights->GetTensorInfo()); + + m_RecurrentToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToOutputWeightsTensor, m_Data.m_RecurrentToOutputWeights->GetTensorInfo()); + + m_ForgetGateBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_ForgetGateBiasTensor, m_Data.m_ForgetGateBias->GetTensorInfo()); + + m_CellBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_CellBiasTensor, m_Data.m_CellBias->GetTensorInfo()); + + m_OutputGateBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_OutputGateBiasTensor, m_Data.m_OutputGateBias->GetTensorInfo()); + + // for future reference: check the AndroidNN API for the logic here + if (!m_Data.m_Parameters.m_CifgEnabled) + { + m_InputToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputToInputWeightsTensor, m_Data.m_InputToInputWeights->GetTensorInfo()); + + m_RecurrentToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_RecurrentToInputWeightsTensor, m_Data.m_RecurrentToInputWeights->GetTensorInfo()); + + m_CellToInputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + if (m_Data.m_CellToInputWeights != nullptr) + { + BuildArmComputeTensor(*m_CellToInputWeightsTensor, m_Data.m_CellToInputWeights->GetTensorInfo()); + } + + m_InputGateBiasTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_InputGateBiasTensor, m_Data.m_InputGateBias->GetTensorInfo()); + + lstm_param.set_cifg_params(m_InputToInputWeightsTensor.get(), + m_RecurrentToInputWeightsTensor.get(), + m_Data.m_CellToInputWeights != nullptr ? 
m_CellToInputWeightsTensor.get() : nullptr, + m_InputGateBiasTensor.get()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + m_ProjectionWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_ProjectionWeightsTensor, m_Data.m_ProjectionWeights->GetTensorInfo()); + + m_ProjectionBiasTensor = std::make_unique<arm_compute::CLTensor>(); + if (m_Data.m_ProjectionBias != nullptr) + { + BuildArmComputeTensor(*m_ProjectionBiasTensor, m_Data.m_ProjectionBias->GetTensorInfo()); + } + + lstm_param.set_projection_params(m_ProjectionWeightsTensor.get(), + m_Data.m_ProjectionBias != nullptr ? m_ProjectionBiasTensor.get() : nullptr); + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + m_CellToForgetWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_CellToForgetWeightsTensor, m_Data.m_CellToForgetWeights->GetTensorInfo()); + + m_CellToOutputWeightsTensor = std::make_unique<arm_compute::CLTensor>(); + BuildArmComputeTensor(*m_CellToOutputWeightsTensor, m_Data.m_CellToOutputWeights->GetTensorInfo()); + + lstm_param.set_peephole_params(m_CellToForgetWeightsTensor.get(), m_CellToOutputWeightsTensor.get()); + } + + const arm_compute::ICLTensor& input = static_cast<IClTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); + const arm_compute::ICLTensor& output_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[1])->GetTensor(); + const arm_compute::ICLTensor& cell_state_in = static_cast<IClTensorHandle*>(m_Data.m_Inputs[2])->GetTensor(); + + arm_compute::ICLTensor& output_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[1])->GetTensor(); + arm_compute::ICLTensor& cell_state_out = static_cast<IClTensorHandle*>(m_Data.m_Outputs[2])->GetTensor(); + arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[3])->GetTensor(); + + // Get the batch_size and the num_units from the cellStateIn dimensions + const TensorInfo& inputTensorInfo = info.m_InputTensorInfos[2]; + const unsigned int batch_size = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[0]); + const unsigned int num_units = boost::numeric_cast<unsigned int>(inputTensorInfo.GetShape()[1]); + + m_ScratchBuffer = std::make_unique<arm_compute::CLTensor>(); + if (m_Data.m_Parameters.m_CifgEnabled) + { + // 2D tensor with dimensions [num_units * 4, batch_size] with CIFG + armnn::TensorInfo scratchBuffer1({ batch_size, num_units * 4 }, DataType::Float32); + BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer1); + } + else + { + // scratch_buffer [num_units * 3, batch_size] without CIFG + armnn::TensorInfo scratchBuffer2({ batch_size, num_units * 3 }, DataType::Float32); + BuildArmComputeTensor(*m_ScratchBuffer, scratchBuffer2); + } + + float cell_threshold = m_Data.m_Parameters.m_ClippingThresCell; + float projection_threshold = m_Data.m_Parameters.m_ClippingThresProj; + + // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations + arm_compute::ActivationLayerInfo activationLayerInfo; + if (m_Data.m_Parameters.m_ActivationFunc == 0) + { + // no activation, do nothing + } + else if (m_Data.m_Parameters.m_ActivationFunc == 1) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::RELU); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 3) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 4) + { + 
activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0); + } + else if (m_Data.m_Parameters.m_ActivationFunc == 6) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); + } + else + { + throw armnn::Exception("Wrong Type of Activation Function!"); + } + + + m_LstmLayer.configure(&input, m_InputToForgetWeightsTensor.get(), m_InputToCellWeightsTensor.get(), + m_InputToOutputWeightsTensor.get(), m_RecurrentToForgetWeightsTensor.get(), + m_RecurrentToCellWeightsTensor.get(), m_RecurrentToOutputWeightsTensor.get(), + m_ForgetGateBiasTensor.get(), m_CellBiasTensor.get(), m_OutputGateBiasTensor.get(), + &output_state_in, &cell_state_in, m_ScratchBuffer.get(), &output_state_out, + &cell_state_out, &output, lstm_param, activationLayerInfo, + cell_threshold, projection_threshold); + + armcomputetensorutils::InitialiseArmComputeTensorEmpty(*m_ScratchBuffer); + + InitialiseArmComputeClTensorData(*m_InputToForgetWeightsTensor, + m_Data.m_InputToForgetWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_InputToCellWeightsTensor, + m_Data.m_InputToCellWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_InputToOutputWeightsTensor, + m_Data.m_InputToOutputWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToForgetWeightsTensor, + m_Data.m_RecurrentToForgetWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToCellWeightsTensor, + m_Data.m_RecurrentToCellWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToOutputWeightsTensor, + m_Data.m_RecurrentToOutputWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_ForgetGateBiasTensor, + m_Data.m_ForgetGateBias->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_CellBiasTensor, + m_Data.m_CellBias->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_OutputGateBiasTensor, + m_Data.m_OutputGateBias->GetConstTensor<float>()); + + if (!m_Data.m_Parameters.m_CifgEnabled) + { + InitialiseArmComputeClTensorData(*m_InputToInputWeightsTensor, + m_Data.m_InputToInputWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_RecurrentToInputWeightsTensor, + m_Data.m_RecurrentToInputWeights->GetConstTensor<float>()); + if (m_Data.m_CellToInputWeights != nullptr) + { + InitialiseArmComputeClTensorData(*m_CellToInputWeightsTensor, + m_Data.m_CellToInputWeights->GetConstTensor<float>()); + } + InitialiseArmComputeClTensorData(*m_InputGateBiasTensor, + m_Data.m_InputGateBias->GetConstTensor<float>()); + } + + if (m_Data.m_Parameters.m_ProjectionEnabled) + { + InitialiseArmComputeClTensorData(*m_ProjectionWeightsTensor, + m_Data.m_ProjectionWeights->GetConstTensor<float>()); + if (m_Data.m_ProjectionBias != nullptr) + { + InitialiseArmComputeClTensorData(*m_ProjectionBiasTensor, + m_Data.m_ProjectionBias->GetConstTensor<float>()); + } + } + + if (m_Data.m_Parameters.m_PeepholeEnabled) + { + InitialiseArmComputeClTensorData(*m_CellToForgetWeightsTensor, + m_Data.m_CellToForgetWeights->GetConstTensor<float>()); + InitialiseArmComputeClTensorData(*m_CellToOutputWeightsTensor, + m_Data.m_CellToOutputWeights->GetConstTensor<float>()); + } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_LstmLayer.prepare(); + FreeUnusedTensors(); +} + +void 
ClLstmFloat32Workload::Execute() const +{ + m_LstmLayer.run(); +} + +arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, + const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, + const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, + const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, + const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, + const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, + const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, + const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights) +{ + arm_compute::LSTMParams<arm_compute::ITensorInfo> lstm_params_info; + + // The inputs and the outputs + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputStateInInfo = BuildArmComputeTensorInfo(outputStateIn); + const arm_compute::TensorInfo aclCellStateInInfo = BuildArmComputeTensorInfo(cellStateIn); + const arm_compute::TensorInfo aclScratchBufferInfo = BuildArmComputeTensorInfo(scratchBuffer); + const arm_compute::TensorInfo aclOutputStateOutInfo = BuildArmComputeTensorInfo(outputStateOut); + const arm_compute::TensorInfo aclCellStateOutInfo = BuildArmComputeTensorInfo(cellStateOut); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + + // Basic parameters + const arm_compute::TensorInfo aclInputToForgetWeightsInfo = BuildArmComputeTensorInfo(inputToForgetWeights); + const arm_compute::TensorInfo aclInputToCellWeightsInfo = BuildArmComputeTensorInfo(inputToCellWeights); + const arm_compute::TensorInfo aclInputToOutputWeightsInfo = BuildArmComputeTensorInfo(inputToOutputWeights); + const arm_compute::TensorInfo aclRecurrentToForgetWeightsInfo + = BuildArmComputeTensorInfo(recurrentToForgetWeights); + const arm_compute::TensorInfo aclRecurrentToCellWeightsInfo + = BuildArmComputeTensorInfo(recurrentToCellWeights); + const arm_compute::TensorInfo aclRecurrentToOutputWeightsInfo + = BuildArmComputeTensorInfo(recurrentToOutputWeights); + const arm_compute::TensorInfo aclForgetGateBiasInfo = BuildArmComputeTensorInfo(forgetGateBias); + const arm_compute::TensorInfo aclCellBiasInfo = BuildArmComputeTensorInfo(cellBias); + const arm_compute::TensorInfo aclOutputGateBiasInfo = BuildArmComputeTensorInfo(outputGateBias); + + arm_compute::TensorInfo aclInputToInputWeightsInfo; + arm_compute::TensorInfo aclRecurrentToInputWeightsInfo; + arm_compute::TensorInfo aclCellToInputWeightsInfo; + arm_compute::TensorInfo aclInputGateBiasInfo; + arm_compute::TensorInfo aclProjectionWeightsInfo; + arm_compute::TensorInfo aclProjectionBiasInfo; + arm_compute::TensorInfo aclCellToForgetWeightsInfo; + arm_compute::TensorInfo aclCellToOutputWeightsInfo; + + if (!descriptor.m_CifgEnabled) + { + armnn::TensorInfo inputToInputWInfo = *inputToInputWeights; + aclInputToInputWeightsInfo = BuildArmComputeTensorInfo(inputToInputWInfo); + armnn::TensorInfo recurrentToInputWInfo = *recurrentToInputWeights; + aclRecurrentToInputWeightsInfo = 
BuildArmComputeTensorInfo(recurrentToInputWInfo); + + if (cellToInputWeights != nullptr) + { + armnn::TensorInfo cellToInputWInfo = *cellToInputWeights; + aclCellToInputWeightsInfo = BuildArmComputeTensorInfo(cellToInputWInfo); + } + armnn::TensorInfo inputGateBiasInfo = *inputGateBias; + aclInputGateBiasInfo = BuildArmComputeTensorInfo(inputGateBiasInfo); + lstm_params_info.set_cifg_params(&aclInputToInputWeightsInfo, &aclRecurrentToInputWeightsInfo, + cellToInputWeights != nullptr ? &aclCellToInputWeightsInfo: nullptr, + &aclInputGateBiasInfo); + } + + if (descriptor.m_ProjectionEnabled) + { + const armnn::TensorInfo& projectionWInfo = *projectionWeights; + aclProjectionWeightsInfo = BuildArmComputeTensorInfo(projectionWInfo); + + if (projectionBias != nullptr) + { + const armnn::TensorInfo& projectionBiasInfo = *projectionBias; + aclProjectionBiasInfo = BuildArmComputeTensorInfo(projectionBiasInfo); + } + lstm_params_info.set_projection_params(&aclProjectionWeightsInfo, + projectionBias != nullptr ? &aclProjectionBiasInfo: nullptr); + } + + if (descriptor.m_PeepholeEnabled) + { + const armnn::TensorInfo& cellToForgetWInfo = *cellToForgetWeights; + aclCellToForgetWeightsInfo = BuildArmComputeTensorInfo(cellToForgetWInfo); + const armnn::TensorInfo& cellToOutputWInfo = *cellToOutputWeights; + aclCellToOutputWeightsInfo = BuildArmComputeTensorInfo(cellToOutputWInfo); + lstm_params_info.set_peephole_params(&aclCellToForgetWeightsInfo, &aclCellToOutputWeightsInfo); + } + + float cell_threshold = descriptor.m_ClippingThresCell; + float projection_threshold = descriptor.m_ClippingThresProj; + + // for preparing the object for the class ActivationLayerInfo, we need to consider 5 situations + arm_compute::ActivationLayerInfo activationLayerInfo; + if (descriptor.m_ActivationFunc == 0) + { + // no activation, do nothing + } + else if (descriptor.m_ActivationFunc == 1) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::RELU); + } + else if (descriptor.m_ActivationFunc == 3) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, 6.0); + } + else if (descriptor.m_ActivationFunc == 4) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::TANH, 1.0, 1.0); + } + else if (descriptor.m_ActivationFunc == 6) + { + activationLayerInfo = arm_compute::ActivationLayerInfo( + arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC); + } + else + { + throw armnn::Exception("Wrong Type of Activation Function!"); + } + + return arm_compute::CLLSTMLayer::validate(&aclInputInfo, &aclInputToForgetWeightsInfo, + &aclInputToCellWeightsInfo, + &aclInputToOutputWeightsInfo, + &aclRecurrentToForgetWeightsInfo, + &aclRecurrentToCellWeightsInfo, + &aclRecurrentToOutputWeightsInfo, + &aclForgetGateBiasInfo, + &aclCellBiasInfo, + &aclOutputGateBiasInfo, + &aclOutputStateInInfo, &aclCellStateInInfo, + &aclScratchBufferInfo, &aclOutputStateOutInfo, + &aclCellStateOutInfo, &aclOutputInfo, + lstm_params_info, activationLayerInfo, + cell_threshold, projection_threshold); +} + +void ClLstmFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_InputToInputWeightsTensor); + FreeTensorIfUnused(m_InputToForgetWeightsTensor); + FreeTensorIfUnused(m_InputToCellWeightsTensor); + FreeTensorIfUnused(m_InputToOutputWeightsTensor); + FreeTensorIfUnused(m_RecurrentToInputWeightsTensor); + 
FreeTensorIfUnused(m_RecurrentToForgetWeightsTensor); + FreeTensorIfUnused(m_RecurrentToCellWeightsTensor); + FreeTensorIfUnused(m_RecurrentToOutputWeightsTensor); + FreeTensorIfUnused(m_CellToInputWeightsTensor); + FreeTensorIfUnused(m_CellToForgetWeightsTensor); + FreeTensorIfUnused(m_CellToOutputWeightsTensor); + FreeTensorIfUnused(m_InputGateBiasTensor); + FreeTensorIfUnused(m_ForgetGateBiasTensor); + FreeTensorIfUnused(m_CellBiasTensor); + FreeTensorIfUnused(m_OutputGateBiasTensor); + FreeTensorIfUnused(m_ProjectionWeightsTensor); + FreeTensorIfUnused(m_ProjectionBiasTensor); + FreeTensorIfUnused(m_ScratchBuffer); +} + +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp new file mode 100644 index 0000000000..e2358ad10d --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClLstmFloat32Workload.hpp @@ -0,0 +1,67 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class ClLstmFloat32Workload : public FloatWorkload<LstmQueueDescriptor> +{ +public: + ClLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info); + void Execute() const override; + +private: + mutable arm_compute::CLLSTMLayer m_LstmLayer; + + std::unique_ptr<arm_compute::CLTensor> m_InputToInputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputToForgetWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputToCellWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputToOutputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToInputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToForgetWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToCellWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_RecurrentToOutputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellToInputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellToForgetWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellToOutputWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_InputGateBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_ForgetGateBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_CellBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_OutputGateBiasTensor; + std::unique_ptr<arm_compute::CLTensor> m_ProjectionWeightsTensor; + std::unique_ptr<arm_compute::CLTensor> m_ProjectionBiasTensor; + + std::unique_ptr<arm_compute::CLTensor> m_ScratchBuffer; + + void FreeUnusedTensors(); +}; + +arm_compute::Status ClLstmFloat32WorkloadValidate(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor &descriptor, + const TensorInfo& inputToForgetWeights, + const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, + const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, + const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, + const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, + const TensorInfo* cellToInputWeights, + const 
TensorInfo* inputGateBias, + const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, + const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights); +} //namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp index 4d2d708a0e..89e7690a36 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.cpp @@ -11,7 +11,7 @@ namespace armnn void ClMergerFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerFloat32Workload_Execute"); ClBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp index 9808d30ccf..3cafa23c1e 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClMergerFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class ClMergerFloat32Workload : public ClBaseMergerWorkload<armnn::DataType::Float32> +class ClMergerFloat32Workload : public ClBaseMergerWorkload<DataType::Float16, DataType::Float32> { public: - using ClBaseMergerWorkload<armnn::DataType::Float32>::ClBaseMergerWorkload; + using ClBaseMergerWorkload<DataType::Float16, DataType::Float32>::ClBaseMergerWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp index 94a1d3c593..551135b7da 100644 --- a/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMergerUint8Workload.cpp @@ -11,7 +11,7 @@ namespace armnn void ClMergerUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMergerUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMergerUint8Workload_Execute"); ClBaseMergerWorkload<DataType::QuantisedAsymm8>::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp index 405d109aa1..7aa33146f3 100644 --- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.cpp @@ -10,9 +10,29 @@ namespace armnn { +arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, + // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be + // ignored for F32 tensors. 
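Editorial note (not part of the patch): the new ...WorkloadValidate helpers introduced throughout this change return an arm_compute::Status rather than a bool, so callers can surface Compute Library's error description when a configuration is rejected. A sketch of how such a result might be consumed; the wrapper function and its parameters are illustrative, not taken from the patch.

#include <arm_compute/core/Error.h>
#include <string>

// Illustrative only: convert an arm_compute::Status into a bool plus an optional reason string.
bool IsSupported(const arm_compute::Status& status, std::string* reasonIfUnsupported)
{
    const bool supported = (status.error_code() == arm_compute::ErrorCode::OK);
    if (!supported && reasonIfUnsupported != nullptr)
    {
        *reasonIfUnsupported = status.error_description();
    }
    return supported;
}

A result such as the one returned by ClMultiplicationWorkloadValidate(input0, input1, output) below could be fed straight into a check of this shape from the layer-support code.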
+ return arm_compute::CLPixelWiseMultiplication::validate(&aclInput1, + &aclInput2, + &aclOutput, + 1.0f, + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_ZERO); +} + + ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<MultiplicationQueueDescriptor>(descriptor, info) + : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClMultiplicationFloat32Workload", 2, 1); @@ -30,9 +50,9 @@ ClMultiplicationFloat32Workload::ClMultiplicationFloat32Workload(const Multiplic void ClMultiplicationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClMultiplicationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClMultiplicationFloat32Workload_Execute"); - // Execute the layer + // Executes the layer. m_PixelWiseMultiplication.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp index 8e387118e8..0d6199047d 100644 --- a/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClMultiplicationFloat32Workload.hpp @@ -9,12 +9,17 @@ namespace armnn { -class ClMultiplicationFloat32Workload : public Float32Workload<MultiplicationQueueDescriptor> + +arm_compute::Status ClMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); + +class ClMultiplicationFloat32Workload : public FloatWorkload<MultiplicationQueueDescriptor> { public: ClMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info); - using Float32Workload<MultiplicationQueueDescriptor>::Float32Workload; + using FloatWorkload<MultiplicationQueueDescriptor>::FloatWorkload; void Execute() const override; private: diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp index a163ec2883..d23d6e11bd 100644 --- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.cpp @@ -27,7 +27,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, con ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<NormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClNormalizationFloat32Workload", 1, 1); @@ -42,7 +42,7 @@ ClNormalizationFloat32Workload::ClNormalizationFloat32Workload(const Normalizati void ClNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClNormalizationFloat32Workload_Execute"); m_NormalizationLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp index cbd5fa92a9..e8ab0b9a18 100644 --- a/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClNormalizationFloat32Workload.hpp @@ -14,7 +14,7 @@ arm_compute::Status ClNormalizationWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const NormalizationDescriptor& 
descriptor); -class ClNormalizationFloat32Workload : public Float32Workload<NormalizationQueueDescriptor> +class ClNormalizationFloat32Workload : public FloatWorkload<NormalizationQueueDescriptor> { public: ClNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp index 3147e95b2e..3c132cb8f8 100644 --- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.cpp @@ -24,10 +24,10 @@ arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descripto return arm_compute::Status{}; } -template <armnn::DataType DataType> -ClPermuteWorkload<DataType>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, +template <armnn::DataType... DataTypes> +ClPermuteWorkload<DataTypes...>::ClPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload<PermuteQueueDescriptor, DataType>(descriptor, info) + : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info) { using armcomputetensorutils::BuildArmComputePermutationVector; @@ -37,18 +37,18 @@ ClPermuteWorkload<DataType>::ClPermuteWorkload(const PermuteQueueDescriptor& des arm_compute::ICLTensor& output = static_cast<IClTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings; - // Run the layer + // Run the layer. m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings)); } -template <armnn::DataType DataType> -void ClPermuteWorkload<DataType>::Execute() const +template <armnn::DataType... DataTypes> +void ClPermuteWorkload<DataTypes...>::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, GetName() + "_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL( GetName() + "_Execute"); m_PermuteFunction.run(); } -template class ClPermuteWorkload<DataType::Float32>; +template class ClPermuteWorkload<DataType::Float16, DataType::Float32>; template class ClPermuteWorkload<DataType::QuantisedAsymm8>; } // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp index 430c59524e..c8726bc2c6 100644 --- a/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPermuteWorkload.hpp @@ -7,6 +7,7 @@ #include "backends/Workload.hpp" #include "backends/WorkloadData.hpp" +#include "backends/ClWorkloadUtils.hpp" #include <armnn/TypesUtils.hpp> #include <arm_compute/runtime/CL/functions/CLPermute.h> @@ -18,13 +19,13 @@ namespace armnn arm_compute::Status ClPermuteWorkloadValidate(const PermuteDescriptor& descriptor); -template <armnn::DataType DataType> -class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataType> +template<armnn::DataType... 
DataTypes> +class ClPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...> { public: static const std::string& GetName() { - static const std::string name = std::string("ClPermute") + GetDataTypeName(DataType) + "Workload"; + static const std::string name = std::string("ClPermuteWorkload"); return name; } @@ -32,11 +33,11 @@ public: void Execute() const override; private: - using TypedWorkload<PermuteQueueDescriptor, DataType>::m_Data; + using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data; mutable arm_compute::CLPermute m_PermuteFunction; }; -using ClPermuteFloat32Workload = ClPermuteWorkload<DataType::Float32>; +using ClPermuteFloatWorkload = ClPermuteWorkload<DataType::Float16, DataType::Float32>; using ClPermuteUint8Workload = ClPermuteWorkload<DataType::QuantisedAsymm8>; -} //namespace armnn +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp index dbdc06f174..6b8a230912 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.cpp @@ -25,10 +25,10 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input, return arm_compute::CLPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo); } -template <armnn::DataType dataType> -ClPooling2dBaseWorkload<dataType>::ClPooling2dBaseWorkload( +template <armnn::DataType... dataTypes> +ClPooling2dBaseWorkload<dataTypes...>::ClPooling2dBaseWorkload( const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name) - : TypedWorkload<Pooling2dQueueDescriptor, dataType>(descriptor, info) + : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info) { m_Data.ValidateInputsOutputs(name, 1, 1); @@ -37,11 +37,11 @@ ClPooling2dBaseWorkload<dataType>::ClPooling2dBaseWorkload( arm_compute::PoolingLayerInfo layerInfo = BuildArmComputePoolingLayerInfo(m_Data.m_Parameters); - // Run the layer + // Run the layer. m_PoolingLayer.configure(&input, &output, layerInfo); } -template class ClPooling2dBaseWorkload<DataType::Float32>; +template class ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>; template class ClPooling2dBaseWorkload<DataType::QuantisedAsymm8>; } diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp index 828f000505..aea32c9e86 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dBaseWorkload.hpp @@ -14,12 +14,12 @@ arm_compute::Status ClPooling2dWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const Pooling2dDescriptor& descriptor); -// Base class template providing an implementation of the Pooling2d layer common to all data types -template <armnn::DataType dataType> -class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataType> +// Base class template providing an implementation of the Pooling2d layer common to all data types. +template <armnn::DataType... 
dataTypes> +class ClPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...> { public: - using TypedWorkload<Pooling2dQueueDescriptor, dataType>::m_Data; + using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data; ClPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name); diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp index a7f5855b8a..3a5b8ca526 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.cpp @@ -10,13 +10,13 @@ namespace armnn ClPooling2dFloat32Workload::ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : ClPooling2dBaseWorkload<DataType::Float32>(descriptor, info, "ClPooling2dFloat32Workload") + : ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32>(descriptor, info, "ClPooling2dFloat32Workload") { } void ClPooling2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dFloat32Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp index 3456a2cff8..ad189bdb52 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload<DataType::Float32> +class ClPooling2dFloat32Workload : public ClPooling2dBaseWorkload<DataType::Float16, DataType::Float32> { public: ClPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp index 2d2109e252..94cf753f5a 100644 --- a/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClPooling2dUint8Workload.cpp @@ -16,7 +16,7 @@ ClPooling2dUint8Workload::ClPooling2dUint8Workload(const Pooling2dQueueDescripto void ClPooling2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClPooling2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClPooling2dUint8Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp index 7b4ad4415b..05fba222ac 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.cpp @@ -11,7 +11,7 @@ namespace armnn { ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ReshapeQueueDescriptor>(descriptor, info) + : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClReshapeFloat32Workload", 1, 1); @@ -23,7 +23,7 @@ ClReshapeFloat32Workload::ClReshapeFloat32Workload(const ReshapeQueueDescriptor& void ClReshapeFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeFloat32Workload_Execute"); m_Layer.run(); } diff --git 
a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp index e344ee08ad..0eb4d08da0 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClReshapeFloat32Workload : public Float32Workload<ReshapeQueueDescriptor> +class ClReshapeFloat32Workload : public FloatWorkload<ReshapeQueueDescriptor> { public: ClReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp index 36cc1dec17..050fb9aa33 100644 --- a/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClReshapeUint8Workload.cpp @@ -21,7 +21,7 @@ ClReshapeUint8Workload::ClReshapeUint8Workload(const ReshapeQueueDescriptor& des void ClReshapeUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClReshapeUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClReshapeUint8Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp index d71011a2e3..abef682611 100644 --- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.cpp @@ -14,7 +14,7 @@ namespace armnn ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ResizeBilinearQueueDescriptor>(descriptor, info) + : FloatWorkload<ResizeBilinearQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("ClResizeBilinearFloat32Workload", 1, 1); @@ -28,7 +28,7 @@ ClResizeBilinearFloat32Workload::ClResizeBilinearFloat32Workload(const ResizeBil void ClResizeBilinearFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClResizeBilinearFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClResizeBilinearFloat32Workload_Execute"); m_ResizeBilinearLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp index 5f70e71619..81c0566bb3 100644 --- a/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClResizeBilinearFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class ClResizeBilinearFloat32Workload : public Float32Workload<ResizeBilinearQueueDescriptor> +class ClResizeBilinearFloat32Workload : public FloatWorkload<ResizeBilinearQueueDescriptor> { public: ClResizeBilinearFloat32Workload(const ResizeBilinearQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp new file mode 100644 index 0000000000..cd3107cfe1 --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.cpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "ClSoftmaxBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + // NOTE: We report 4D Softmax as unsupported until full support is added to ACL + if(input.GetShape().GetNumDimensions() >= 4u) + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported"); + } + + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::CLSoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo); +} + +} diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp new file mode 100644 index 0000000000..e0113134af --- /dev/null +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxBaseWorkload.hpp @@ -0,0 +1,16 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status ClSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +} // namespace armnn diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp index 1d05172b42..08247bc593 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.cpp @@ -12,7 +12,7 @@ namespace armnn ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<SoftmaxQueueDescriptor>(descriptor, info) + : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info) , m_SoftmaxLayer(memoryManager) { m_Data.ValidateInputsOutputs("ClSoftmaxFloat32Workload", 1, 1); @@ -24,7 +24,7 @@ ClSoftmaxFloat32Workload::ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& void ClSoftmaxFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSoftmaxFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxFloat32Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp index cf5c45ac6f..6cad59800b 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class ClSoftmaxFloat32Workload : public Float32Workload<SoftmaxQueueDescriptor> +class ClSoftmaxFloat32Workload : public FloatWorkload<SoftmaxQueueDescriptor> { public: ClSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp index ee9ab4754b..3cd9a6a5ec 100644 --- a/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSoftmaxUint8Workload.cpp @@ -33,7 +33,7 @@ ClSoftmaxUint8Workload::ClSoftmaxUint8Workload(const SoftmaxQueueDescriptor& des void ClSoftmaxUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, 
"ClSoftmaxUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSoftmaxUint8Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp index 6221d56766..8a622c6caf 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void ClSplitterFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterFloat32Workload_Execute"); ClBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp index cfc7eaa3c2..affa9f840f 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class ClSplitterFloat32Workload : public ClBaseSplitterWorkload<DataType::Float32> +class ClSplitterFloat32Workload : public ClBaseSplitterWorkload<DataType::Float16, DataType::Float32> { public: - using ClBaseSplitterWorkload<DataType::Float32>::ClBaseSplitterWorkload; + using ClBaseSplitterWorkload<DataType::Float16, DataType::Float32>::ClBaseSplitterWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp index 3aa470894c..d2d25495e0 100644 --- a/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp +++ b/src/armnn/backends/ClWorkloads/ClSplitterUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void ClSplitterUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "ClSplitterUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_CL("ClSplitterUint8Workload_Execute"); ClBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/CpuTensorHandle.cpp b/src/armnn/backends/CpuTensorHandle.cpp index dd8176c9ec..78cf6efd2e 100644 --- a/src/armnn/backends/CpuTensorHandle.cpp +++ b/src/armnn/backends/CpuTensorHandle.cpp @@ -45,6 +45,12 @@ ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ConstTensor& tensor) CopyFrom(tensor.GetMemoryArea(), tensor.GetNumBytes()); } +ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ConstCpuTensorHandle& tensorHandle) +: ScopedCpuTensorHandle(tensorHandle.GetTensorInfo()) +{ + CopyFrom(tensorHandle.GetConstTensor<void>(), tensorHandle.GetTensorInfo().GetNumBytes()); +} + ScopedCpuTensorHandle::ScopedCpuTensorHandle(const ScopedCpuTensorHandle& other) : CpuTensorHandle(other.GetTensorInfo()) { diff --git a/src/armnn/backends/CpuTensorHandle.hpp b/src/armnn/backends/CpuTensorHandle.hpp index 4bf4439083..3376650ec3 100644 --- a/src/armnn/backends/CpuTensorHandle.hpp +++ b/src/armnn/backends/CpuTensorHandle.hpp @@ -9,10 +9,12 @@ #include "OutputHandler.hpp" +#include <algorithm> + namespace armnn { -// Abstract tensor handle wrapping a CPU-readable region of memory, interpreting it as tensor data. +// Abstract tensor handles wrapping a CPU-readable region of memory, interpreting it as tensor data. 
class ConstCpuTensorHandle : public ITensorHandle { public: @@ -33,6 +35,30 @@ public: return ITensorHandle::Cpu; } + virtual void Manage() override {} + + virtual ITensorHandle* GetParent() const override { return nullptr; } + + virtual const void* Map(bool /* blocking = true */) const override { return m_Memory; } + virtual void Unmap() const override {} + + TensorShape GetStrides() const override + { + TensorShape shape(m_TensorInfo.GetShape()); + auto size = GetDataTypeSize(m_TensorInfo.GetDataType()); + auto runningSize = size; + std::vector<unsigned int> strides(shape.GetNumDimensions()); + auto lastIdx = shape.GetNumDimensions()-1; + for (unsigned int i=0; i < lastIdx ; i++) + { + strides[lastIdx-i] = runningSize; + runningSize *= shape[lastIdx-i]; + } + strides[0] = runningSize; + return TensorShape(shape.GetNumDimensions(), strides.data()); + } + TensorShape GetShape() const override { return m_TensorInfo.GetShape(); } + protected: ConstCpuTensorHandle(const TensorInfo& tensorInfo); @@ -46,7 +72,7 @@ private: const void* m_Memory; }; -// Abstract specialization of ConstCpuTensorHandle that allows write access to the same data +// Abstract specialization of ConstCpuTensorHandle that allows write access to the same data. class CpuTensorHandle : public ConstCpuTensorHandle { public: @@ -79,9 +105,12 @@ class ScopedCpuTensorHandle : public CpuTensorHandle public: explicit ScopedCpuTensorHandle(const TensorInfo& tensorInfo); - // Copies contents from Tensor + // Copies contents from Tensor. explicit ScopedCpuTensorHandle(const ConstTensor& tensor); + // Copies contents from ConstCpuTensorHandle + explicit ScopedCpuTensorHandle(const ConstCpuTensorHandle& tensorHandle); + ScopedCpuTensorHandle(const ScopedCpuTensorHandle& other); ScopedCpuTensorHandle& operator=(const ScopedCpuTensorHandle& other); ~ScopedCpuTensorHandle(); @@ -98,7 +127,7 @@ private: // Clients must make sure the passed in memory region stays alive for the lifetime of // the PassthroughCpuTensorHandle instance. // -// Note there is no polymorphism to/from ConstPassthroughCpuTensorHandle +// Note there is no polymorphism to/from ConstPassthroughCpuTensorHandle. class PassthroughCpuTensorHandle : public CpuTensorHandle { public: @@ -117,7 +146,7 @@ public: // Clients must make sure the passed in memory region stays alive for the lifetime of // the PassthroughCpuTensorHandle instance. // -// Note there is no polymorphism to/from PassthroughCpuTensorHandle +// Note there is no polymorphism to/from PassthroughCpuTensorHandle. class ConstPassthroughCpuTensorHandle : public ConstCpuTensorHandle { public: @@ -131,7 +160,7 @@ public: }; -// template specializations +// Template specializations. template <> const void* ConstCpuTensorHandle::GetConstTensor() const; diff --git a/src/armnn/backends/ITensorHandle.hpp b/src/armnn/backends/ITensorHandle.hpp index b95dcc65e0..ab571ab305 100644 --- a/src/armnn/backends/ITensorHandle.hpp +++ b/src/armnn/backends/ITensorHandle.hpp @@ -7,6 +7,8 @@ namespace armnn { +class TensorShape; + class ITensorHandle { public: @@ -18,8 +20,54 @@ public: }; virtual ~ITensorHandle(){} + + /// Indicate to the memory manager that this resource is active. + /// This is used to compute overlapping lifetimes of resources. + virtual void Manage() = 0; + + /// Indicate to the memory manager that this resource is no longer active. + /// This is used to compute overlapping lifetimes of resources. virtual void Allocate() = 0; + + /// Get the type backend associated with the tensor handle. 
+ /// \return Type enum virtual ITensorHandle::Type GetType() const = 0; + + /// Get the parent tensor if this is a subtensor. + /// \return a pointer to the parent tensor. Otherwise nullptr if not a subtensor. + virtual ITensorHandle* GetParent() const = 0; + + /// Map the tensor data for access. + /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent) + /// \return pointer to the first element of the mapped data. + virtual const void* Map(bool blocking=true) const = 0; + + /// Unmap the tensor data + virtual void Unmap() const = 0; + + /// Map the tensor data for access. Must be paired with call to Unmap(). + /// \param blocking hint to block the calling thread until all other accesses are complete. (backend dependent) + /// \return pointer to the first element of the mapped data. + void* Map(bool blocking=true) + { + return const_cast<void*>(static_cast<const ITensorHandle*>(this)->Map(blocking)); + } + + /// Unmap the tensor data that was previously mapped with call to Map(). + void Unmap() + { + return static_cast<const ITensorHandle*>(this)->Unmap(); + } + + /// Get the strides for each dimension ordered from largest to smallest where + /// the smallest value is the same as the size of a single element in the tensor. + /// \return a TensorShape filled with the strides for each dimension + virtual TensorShape GetStrides() const = 0; + + /// Get the number of elements for each dimension orderd from slowest iterating dimension + /// to fastest iterating dimension. + /// \return a TensorShape filled with the number of elements for each dimension. + virtual TensorShape GetShape() const = 0; }; } diff --git a/src/armnn/backends/MakeWorkloadHelper.hpp b/src/armnn/backends/MakeWorkloadHelper.hpp index a1f9b0b0eb..64a7f8983b 100644 --- a/src/armnn/backends/MakeWorkloadHelper.hpp +++ b/src/armnn/backends/MakeWorkloadHelper.hpp @@ -9,7 +9,7 @@ namespace armnn namespace { -// Make a workload of the specified WorkloadType +// Make a workload of the specified WorkloadType. template<typename WorkloadType> struct MakeWorkloadForType { @@ -37,7 +37,8 @@ struct MakeWorkloadForType<NullWorkload> // Makes a workload for one the specified types based on the data type requirements of the tensorinfo. // Specify type void as the WorkloadType for unsupported DataType/WorkloadType combos. -template <typename Float32Workload, typename Uint8Workload, typename QueueDescriptorType, typename... Args> +template <typename Float16Workload, typename Float32Workload, typename Uint8Workload, typename QueueDescriptorType, + typename... Args> std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info, Args&&... args) { const DataType dataType = !info.m_InputTensorInfos.empty() ? @@ -49,6 +50,8 @@ std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, c switch (dataType) { + case DataType::Float16: + return MakeWorkloadForType<Float16Workload>::Func(descriptor, info, std::forward<Args>(args)...); case DataType::Float32: return MakeWorkloadForType<Float32Workload>::Func(descriptor, info, std::forward<Args>(args)...); case DataType::QuantisedAsymm8: @@ -59,5 +62,17 @@ std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, c } } +// Makes a workload for one the specified types based on the data type requirements of the tensorinfo. +// Calling this method is the equivalent of calling the three typed MakeWorkload method with <FloatWorkload, +// FloatWorkload, Uint8Workload>. 
+// Specify type void as the WorkloadType for unsupported DataType/WorkloadType combos. +template <typename FloatWorkload, typename Uint8Workload, typename QueueDescriptorType, typename... Args> +std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info, Args&&... args) +{ + return MakeWorkload<FloatWorkload, FloatWorkload, Uint8Workload>(descriptor, info, + std::forward<Args>(args)...); +} + + } //namespace } //namespace armnn diff --git a/src/armnn/backends/MemCopyWorkload.cpp b/src/armnn/backends/MemCopyWorkload.cpp index 09ffd9a08a..27e60f93b7 100644 --- a/src/armnn/backends/MemCopyWorkload.cpp +++ b/src/armnn/backends/MemCopyWorkload.cpp @@ -4,14 +4,7 @@ // #include "MemCopyWorkload.hpp" #include "backends/CpuTensorHandle.hpp" - -#if ARMCOMPUTECL_ENABLED -#include "backends/ClTensorHandle.hpp" -#endif - -#if ARMCOMPUTENEON_ENABLED -#include "backends/NeonTensorHandle.hpp" -#endif +#include "TypeUtils.hpp" #include <cstring> #include <boost/cast.hpp> @@ -26,7 +19,7 @@ template <typename SrcTensorHandleType, typename DstTensorHandleType> void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor, std::vector<std::pair<SrcTensorHandleType*, DstTensorHandleType*>>& tensorHandlePairs) { - const unsigned int numInputs = boost::numeric_cast<unsigned int>(descriptor.m_Inputs.size()); + const unsigned int numInputs = static_cast<unsigned int>(descriptor.m_Inputs.size()); tensorHandlePairs.reserve(numInputs); for (unsigned int i = 0; i < numInputs; ++i) @@ -40,217 +33,29 @@ void GatherTensorHandlePairs(const MemCopyQueueDescriptor& descriptor, } } -void CopyFromCpuToCpu(const ConstCpuTensorHandle& srcHandle, CpuTensorHandle& dstHandle) -{ - const unsigned int numBytes = srcHandle.GetTensorInfo().GetNumBytes(); - const void* const input = srcHandle.GetConstTensor<void>(); - void* const output = dstHandle.GetTensor<void>(); - std::memcpy(output, input, numBytes); -} - -#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED - -#include "backends/ArmComputeTensorUtils.hpp" - -template <armnn::DataType DataType> -void CopyFromCpuToAclBackend(const ConstCpuTensorHandle& srcHandle, arm_compute::ITensor& dstAclTensor) -{ - using T = ResolveType<DataType>; - armnn::armcomputetensorutils::CopyArmComputeITensorData(srcHandle.GetConstTensor<T>(), dstAclTensor); -} - -template <armnn::DataType DataType> -void CopyFromAclBackendToCpu(const arm_compute::ITensor& srcAclTensor, CpuTensorHandle& dstHandle) -{ - using T = ResolveType<DataType>; - armnn::armcomputetensorutils::CopyArmComputeITensorData(srcAclTensor, dstHandle.GetTensor<T>()); -} - -#endif // ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED - -} - -template <armnn::DataType DataType> -CopyFromCpuToCpuWorkload<DataType>::CopyFromCpuToCpuWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template <armnn::DataType DataType> -void CopyFromCpuToCpuWorkload<DataType>::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "CopyFromCpuToCpuWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - CopyFromCpuToCpu(*pair.first, *pair.second); - } -} - -template class CopyFromCpuToCpuWorkload<DataType::Float32>; -template class CopyFromCpuToCpuWorkload<DataType::QuantisedAsymm8>; - -#if ARMCOMPUTECL_ENABLED - -template <armnn::DataType DataType> -CopyFromCpuToClWorkload<DataType>::CopyFromCpuToClWorkload(const 
MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template <armnn::DataType DataType> -void CopyFromCpuToClWorkload<DataType>::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromCpuToClWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.second; - - handle.Map(true); - CopyFromCpuToAclBackend<DataType>(*pair.first, handle.GetTensor()); - handle.UnMap(); - } -} - -template class CopyFromCpuToClWorkload<DataType::Float32>; -template class CopyFromCpuToClWorkload<DataType::QuantisedAsymm8>; - - -template <armnn::DataType DataType> -CopyFromClToCpuWorkload<DataType>::CopyFromClToCpuWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template <armnn::DataType DataType> -void CopyFromClToCpuWorkload<DataType>::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromClToCpuWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.first; - - handle.Map(true); - CopyFromAclBackendToCpu<DataType>(handle.GetTensor(), *pair.second); - handle.UnMap(); - } -} - -template class CopyFromClToCpuWorkload<DataType::Float32>; -template class CopyFromClToCpuWorkload<DataType::QuantisedAsymm8>; - -#endif // ARMCOMPUTECL_ENABLED +} //namespace -#if ARMCOMPUTENEON_ENABLED -template <armnn::DataType DataType> -CopyFromCpuToNeonWorkload<DataType>::CopyFromCpuToNeonWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) +CopyMemGenericWorkload::CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, + const WorkloadInfo& info) + : BaseWorkload<MemCopyQueueDescriptor>(descriptor, info) { GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); } -template <armnn::DataType DataType> -void CopyFromCpuToNeonWorkload<DataType>::Execute() const +void CopyMemGenericWorkload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "CopyFromCpuToNeonWorkload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT(Compute::Undefined, "CopyMemGeneric_Execute"); - for (const auto& pair : m_TensorHandlePairs) - { - CopyFromCpuToAclBackend<DataType>(*pair.first, pair.second->GetTensor()); - } -} - -template class CopyFromCpuToNeonWorkload<DataType::Float32>; -template class CopyFromCpuToNeonWorkload<DataType::QuantisedAsymm8>; - -template <armnn::DataType DataType> -CopyFromNeonToCpuWorkload<DataType>::CopyFromNeonToCpuWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template <armnn::DataType DataType> -void CopyFromNeonToCpuWorkload<DataType>::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "CopyFromNeonToCpuWorkload_Execute"); + auto copyFunc = [](void* dst, const void* src, size_t size) + { + memcpy(dst, src, size); + }; for (const auto& pair : m_TensorHandlePairs) { - CopyFromAclBackendToCpu<DataType>(pair.first->GetTensor(), *pair.second); + CopyTensorContentsGeneric(pair.first, pair.second, copyFunc); } } -template class CopyFromNeonToCpuWorkload<DataType::Float32>; -template class 
CopyFromNeonToCpuWorkload<DataType::QuantisedAsymm8>; - -#endif // ARMCOMPUTENEON_ENABLED - -#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED - -template <armnn::DataType DataType> -CopyFromNeonToClWorkload<DataType>::CopyFromNeonToClWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template <armnn::DataType DataType> -void CopyFromNeonToClWorkload<DataType>::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromNeonToClWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.second; - - handle.Map(true); - handle.GetTensor().copy_from(pair.first->GetTensor()); - handle.UnMap(); - } -} - -template class CopyFromNeonToClWorkload<DataType::Float32>; -template class CopyFromNeonToClWorkload<DataType::QuantisedAsymm8>; - -template <armnn::DataType DataType> -CopyFromClToNeonWorkload<DataType>::CopyFromClToNeonWorkload(const MemCopyQueueDescriptor& descriptor, - const WorkloadInfo& info) - : TypedWorkload<MemCopyQueueDescriptor, DataType>(descriptor, info) -{ - GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); -} - -template <armnn::DataType DataType> -void CopyFromClToNeonWorkload<DataType>::Execute() const -{ - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "CopyFromClToNeonWorkload_Execute"); - - for (const auto& pair : m_TensorHandlePairs) - { - IClTensorHandle& handle = *pair.first; - - handle.Map(true); - pair.second->GetTensor().copy_from(handle.GetTensor()); - handle.UnMap(); - } -} - -template class CopyFromClToNeonWorkload<DataType::Float32>; -template class CopyFromClToNeonWorkload<DataType::QuantisedAsymm8>; - -#endif // ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED - -} +} //namespace armnn diff --git a/src/armnn/backends/MemCopyWorkload.hpp b/src/armnn/backends/MemCopyWorkload.hpp index 7fcaf138c3..7a46e5b2ef 100644 --- a/src/armnn/backends/MemCopyWorkload.hpp +++ b/src/armnn/backends/MemCopyWorkload.hpp @@ -6,131 +6,21 @@ #include "CpuTensorHandleFwd.hpp" #include "backends/Workload.hpp" - +#include "WorkloadUtils.hpp" #include <utility> namespace armnn { -template <armnn::DataType DataType> -class CopyFromCpuToCpuWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> -{ -public: - CopyFromCpuToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair<const ConstCpuTensorHandle*, CpuTensorHandle*>; - std::vector<TensorHandlePair> m_TensorHandlePairs; -}; - -using CopyFromCpuToCpuFloat32Workload = CopyFromCpuToCpuWorkload<DataType::Float32>; -using CopyFromCpuToCpuUint8Workload = CopyFromCpuToCpuWorkload<DataType::QuantisedAsymm8>; - -#if ARMCOMPUTECL_ENABLED - -class IClTensorHandle; - -template <armnn::DataType DataType> -class CopyFromCpuToClWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> -{ -public: - CopyFromCpuToClWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair<const ConstCpuTensorHandle*, IClTensorHandle*>; - std::vector<TensorHandlePair> m_TensorHandlePairs; -}; - -using CopyFromCpuToClFloat32Workload = CopyFromCpuToClWorkload<DataType::Float32>; -using CopyFromCpuToClUint8Workload = CopyFromCpuToClWorkload<DataType::QuantisedAsymm8>; - -template <armnn::DataType DataType> -class 
CopyFromClToCpuWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> -{ -public: - CopyFromClToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair<IClTensorHandle*, CpuTensorHandle*>; - std::vector<TensorHandlePair> m_TensorHandlePairs; -}; - -using CopyFromClToCpuFloat32Workload = CopyFromClToCpuWorkload<DataType::Float32>; -using CopyFromClToCpuUint8Workload = CopyFromClToCpuWorkload<DataType::QuantisedAsymm8>; - -#endif // ARMCOMPUTECL_ENABLED - -#if ARMCOMPUTENEON_ENABLED - -class INeonTensorHandle; - -template <armnn::DataType DataType> -class CopyFromCpuToNeonWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> -{ -public: - CopyFromCpuToNeonWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -protected: - using TensorHandlePair = std::pair<const ConstCpuTensorHandle*, INeonTensorHandle*>; - std::vector<TensorHandlePair> m_TensorHandlePairs; -}; - -using CopyFromCpuToNeonFloat32Workload = CopyFromCpuToNeonWorkload<DataType::Float32>; -using CopyFromCpuToNeonUint8Workload = CopyFromCpuToNeonWorkload<DataType::QuantisedAsymm8>; - -template <armnn::DataType DataType> -class CopyFromNeonToCpuWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> +class CopyMemGenericWorkload : public BaseWorkload<MemCopyQueueDescriptor> { public: - CopyFromNeonToCpuWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -protected: - using TensorHandlePair = std::pair<const INeonTensorHandle*, CpuTensorHandle*>; - std::vector<TensorHandlePair> m_TensorHandlePairs; -}; - -using CopyFromNeonToCpuFloat32Workload = CopyFromNeonToCpuWorkload<DataType::Float32>; -using CopyFromNeonToCpuUint8Workload = CopyFromNeonToCpuWorkload<DataType::QuantisedAsymm8>; - -#endif - -#if ARMCOMPUTECL_ENABLED && ARMCOMPUTENEON_ENABLED - -template <armnn::DataType DataType> -class CopyFromNeonToClWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> -{ -public: - CopyFromNeonToClWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); + CopyMemGenericWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); void Execute() const override; private: - using TensorHandlePair = std::pair<const INeonTensorHandle*, IClTensorHandle*>; + using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>; std::vector<TensorHandlePair> m_TensorHandlePairs; }; -using CopyFromNeonToClFloat32Workload = CopyFromNeonToClWorkload<DataType::Float32>; -using CopyFromNeonToClUint8Workload = CopyFromNeonToClWorkload<DataType::QuantisedAsymm8>; - -template <armnn::DataType DataType> -class CopyFromClToNeonWorkload : public TypedWorkload<MemCopyQueueDescriptor, DataType> -{ -public: - CopyFromClToNeonWorkload(const MemCopyQueueDescriptor& descriptor, const WorkloadInfo& info); - void Execute() const override; - -private: - using TensorHandlePair = std::pair<IClTensorHandle*, INeonTensorHandle*>; - std::vector<TensorHandlePair> m_TensorHandlePairs; -}; - -using CopyFromClToNeonFloat32Workload = CopyFromClToNeonWorkload<DataType::Float32>; -using CopyFromClToNeonUint8Workload = CopyFromClToNeonWorkload<DataType::QuantisedAsymm8>; - -#endif - -} +} //namespace armnn diff --git a/src/armnn/backends/NeonLayerSupport.cpp b/src/armnn/backends/NeonLayerSupport.cpp index bfc84bd086..3aef4e60aa 100644 --- 
a/src/armnn/backends/NeonLayerSupport.cpp +++ b/src/armnn/backends/NeonLayerSupport.cpp @@ -15,34 +15,29 @@ #include <boost/core/ignore_unused.hpp> #ifdef ARMCOMPUTENEON_ENABLED +#include "NeonWorkloads/NeonAdditionFloat32Workload.hpp" +#include "NeonWorkloads/NeonActivationFloat32Workload.hpp" +#include "NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp" #include "NeonWorkloads/NeonConvolution2dBaseWorkload.hpp" -#include "NeonWorkloads/NeonPooling2dBaseWorkload.hpp" +#include "NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp" +#include "NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp" +#include "NeonWorkloads/NeonMultiplicationFloat32Workload.hpp" +#include "NeonWorkloads/NeonNormalizationFloat32Workload.hpp" +#include "NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp" #include "NeonWorkloads/NeonPermuteWorkload.hpp" +#include "NeonWorkloads/NeonPooling2dBaseWorkload.hpp" +#include "NeonWorkloads/NeonSoftmaxBaseWorkload.hpp" #endif using namespace boost; namespace armnn { -bool IsNeonActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters) -{ - if (parameters.m_Function != ActivationFunction::BoundedReLu) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Unsupported activation function, only BoundedReLu is supported)"; - } - - return false; - } - - return true; -} bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc) { // See arm_compute::NEDirectConvolutionLayer documentation for the supported cases, - // and complement with NEDirectConvolutionLayerKernel::configure() implementation + // and complement with NEDirectConvolutionLayerKernel::configure() implementation. // Only 1x1 is using direct convolution. Performance results and details are in: // https://jira.arm.com/browse/IVGCVSW-1003 @@ -60,15 +55,15 @@ bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convol conv2ddesc.m_PadTop > value || conv2ddesc.m_PadBottom > value; }; - // Supported sizes and padding + // Supported sizes and padding. const bool sizeAndPaddingSupported = - // Pad > 0 not supported for 1x1 weights + // Pad > 0 not supported for 1x1 weights. (weightInfo.GetShape()[2] == 1 && weightInfo.GetShape()[3] == 1 && !paddingLargerThan(desc, 0u)); const bool preferDirectConvolution = dataTypeSupported && strideSupported && sizeAndPaddingSupported && - // NEDirectConvolutionLayerKernel doesn't support NULL bias + // NEDirectConvolutionLayerKernel doesn't support NULL bias. desc.m_BiasEnabled; return preferDirectConvolution; } @@ -108,10 +103,10 @@ bool IsNeonBackendSupported(std::string* reasonIfUnsupported) #endif } -template<typename Float32Func, typename Uint8Func, typename ... Params> +template<typename FloatFunc, typename Uint8Func, typename ... Params> bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported, DataType dataType, - Float32Func floatFuncPtr, + FloatFunc floatFuncPtr, Uint8Func uint8FuncPtr, Params&&... 
params) { @@ -119,6 +114,7 @@ bool IsSupportedForDataTypeNeon(std::string* reasonIfUnsupported, IsSupportedForDataTypeGeneric(reasonIfUnsupported, dataType, floatFuncPtr, + floatFuncPtr, uint8FuncPtr, std::forward<Params>(params)...); } @@ -144,43 +140,16 @@ inline bool IsWorkloadSupported(FuncType& func, std::string* reasonIfUnsupported #endif bool IsActivationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported) { ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<const ActivationDescriptor&>, - &IsNeonActivationUint8Supported, - descriptor); -} - -bool IsNeonDepthwiseConvolution2dDescParamsSupported(std::string* reasonIfUnsupported, - const DepthwiseConvolution2dDescriptor& parameters, - const TensorInfo& weights) -{ - ignore_unused(weights); - - if (parameters.m_StrideX < 1 || parameters.m_StrideX > 3) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "m_StrideX can only be 1, 2 or 3"; - } - return false; - } - - // weights.GetShape()[0] = channel multiplier - if (weights.GetShape()[0] != 1) - { - if (reasonIfUnsupported) - { - *reasonIfUnsupported = "Channel multiplier only supports the value 1 in the NEON backend"; - } - return false; - } - - return true; + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonActivationWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor); } bool IsAdditionSupportedNeon(const TensorInfo& input0, @@ -188,23 +157,31 @@ bool IsAdditionSupportedNeon(const TensorInfo& input0, const TensorInfo& output, std::string* reasonIfUnsupported) { - ignore_unused(input1); - ignore_unused(output); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input0.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonAdditionWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); } bool IsBatchNormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonBatchNormalizationValidate, + reasonIfUnsupported, + input, + output, + mean, + var, + beta, + gamma, + descriptor); } bool IsConstantSupportedNeon(const TensorInfo& output, @@ -233,27 +210,40 @@ bool IsConvolution2dSupportedNeon(const TensorInfo& input, } bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &IsNeonDepthwiseConvolution2dDescParamsSupported, - &IsNeonDepthwiseConvolution2dDescParamsSupported, - descriptor, - weights); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonDepthwiseConvolutionWorkloadValidate, + reasonIfUnsupported, + input, + output, + descriptor, + weights, + biases); } bool IsFullyConnectedSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return 
IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + // At the moment U8 is unsupported + if (input.GetDataType() == DataType::QuantisedAsymm8) + { + return false; + } + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonFullyConnectedWorkloadValidate, + reasonIfUnsupported, + input, + output, + weights, + biases, + descriptor); } bool IsInputSupportedNeon(const TensorInfo& input, @@ -266,12 +256,10 @@ bool IsInputSupportedNeon(const TensorInfo& input, } bool IsL2NormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFunc<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonL2NormalizationWorkloadValidate, reasonIfUnsupported, input, output); } bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs, @@ -287,13 +275,14 @@ bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs, bool IsMultiplicationSupportedNeon(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported) { - ignore_unused(input1); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input0.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonMultiplicationWorkloadValidate, + reasonIfUnsupported, + input0, + input1, + output); } bool IsNormalizationSupportedNeon(const TensorInfo& input, @@ -301,11 +290,7 @@ bool IsNormalizationSupportedNeon(const TensorInfo& input, const NormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &IsNeonNormalizationDescParamsSupported, - &FalseFuncU8<const NormalizationDescriptor&>, - descriptor); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonNormalizationWorkloadValidate, reasonIfUnsupported, input, output, descriptor); } bool IsOutputSupportedNeon(const TensorInfo& output, @@ -341,14 +326,11 @@ bool IsResizeBilinearSupportedNeon(const TensorInfo& input, } bool IsSoftmaxSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported) { - ignore_unused(descriptor); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &TrueFunc<>); + FORWARD_WORKLOAD_VALIDATE_FUNC(NeonSoftmaxWorkloadValidate, reasonIfUnsupported, input, output, descriptor); } bool IsSplitterSupportedNeon(const TensorInfo& input, @@ -385,10 +367,72 @@ bool IsFloorSupportedNeon(const TensorInfo& input, std::string* reasonIfUnsupported) { ignore_unused(output); - return IsSupportedForDataTypeNeon(reasonIfUnsupported, - input.GetDataType(), - &TrueFunc<>, - &FalseFuncU8<>); + return IsNeonBackendSupported(reasonIfUnsupported) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &FalseFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>); +} + +bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, 
const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(outputStateIn); + ignore_unused(cellStateIn); + ignore_unused(scratchBuffer); + ignore_unused(outputStateOut); + ignore_unused(cellStateOut); + ignore_unused(output); + ignore_unused(descriptor); + ignore_unused(inputToForgetWeights); + ignore_unused(inputToCellWeights); + ignore_unused(inputToOutputWeights); + ignore_unused(recurrentToForgetWeights); + ignore_unused(recurrentToCellWeights); + ignore_unused(recurrentToOutputWeights); + ignore_unused(forgetGateBias); + ignore_unused(cellBias); + ignore_unused(outputGateBias); + ignore_unused(inputToInputWeights); + ignore_unused(recurrentToInputWeights); + ignore_unused(cellToInputWeights); + ignore_unused(inputGateBias); + ignore_unused(projectionWeights); + ignore_unused(projectionBias); + ignore_unused(cellToForgetWeights); + ignore_unused(cellToOutputWeights); + return false; +} + +bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(output); + return true; +} + +bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(output); + return true; } } diff --git a/src/armnn/backends/NeonLayerSupport.hpp b/src/armnn/backends/NeonLayerSupport.hpp index ce2ecec459..6f9fe9c20e 100644 --- a/src/armnn/backends/NeonLayerSupport.hpp +++ b/src/armnn/backends/NeonLayerSupport.hpp @@ -11,14 +11,13 @@ namespace armnn { -bool IsNeonActivationUint8Supported(std::string* reasonIfUnsupported, const ActivationDescriptor& parameters); - bool IsNeonDirectConvolutionPreferred(const TensorInfo& weightInfo, const Convolution2dDescriptor& desc); bool IsNeonNormalizationDescParamsSupported(std::string* reasonIfUnsupported, const NormalizationDescriptor& parameters); bool IsActivationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported); @@ -32,6 +31,11 @@ bool IsAdditionSupportedNeon(const TensorInfo& input0, std::string* reasonIfUnsupported); bool IsBatchNormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -45,12 +49,18 @@ bool IsConvolution2dSupportedNeon(const TensorInfo& input, const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); + bool IsDepthwiseConvolutionSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); bool IsFullyConnectedSupportedNeon(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -58,6 +68,7 @@ bool IsInputSupportedNeon(const TensorInfo& 
input, std::string* reasonIfUnsupported = nullptr); bool IsL2NormalizationSupportedNeon(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs, @@ -66,6 +77,7 @@ bool IsMergerSupportedNeon(const std::vector<const TensorInfo*> inputs, bool IsMultiplicationSupportedNeon(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsNormalizationSupportedNeon(const TensorInfo& input, @@ -90,6 +102,7 @@ bool IsResizeBilinearSupportedNeon(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsSoftmaxSupportedNeon(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -108,4 +121,26 @@ bool IsFloorSupportedNeon(const TensorInfo& input, const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsLstmSupportedNeon(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp16ToFp32SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp32ToFp16SupportedNeon(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + } diff --git a/src/armnn/backends/NeonTensorHandle.hpp b/src/armnn/backends/NeonTensorHandle.hpp index 684a5e1bfc..3818d2c9b2 100644 --- a/src/armnn/backends/NeonTensorHandle.hpp +++ b/src/armnn/backends/NeonTensorHandle.hpp @@ -7,11 +7,14 @@ #include "OutputHandler.hpp" #include "ArmComputeTensorUtils.hpp" +#include <arm_compute/runtime/MemoryGroup.h> +#include <arm_compute/runtime/IMemoryGroup.h> #include <arm_compute/runtime/Tensor.h> #include <arm_compute/runtime/SubTensor.h> #include <arm_compute/core/TensorShape.h> #include <arm_compute/core/Coordinates.h> +#include <boost/polymorphic_pointer_cast.hpp> namespace armnn { @@ -22,6 +25,7 @@ public: virtual arm_compute::ITensor& GetTensor() = 0; virtual arm_compute::ITensor const& GetTensor() const = 0; virtual arm_compute::DataType GetDataType() const = 0; + virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) = 0; }; class NeonTensorHandle : public INeonTensorHandle @@ -34,47 +38,100 @@ public: arm_compute::ITensor& GetTensor() override { return m_Tensor; } arm_compute::ITensor const& GetTensor() const override { return m_Tensor; } + virtual void Allocate() override { armnn::armcomputetensorutils::InitialiseArmComputeTensorEmpty(m_Tensor); }; + virtual void Manage() 
override + { + BOOST_ASSERT(m_MemoryGroup != nullptr); + m_MemoryGroup->manage(&m_Tensor); + } + virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; } + virtual ITensorHandle* GetParent() const override { return nullptr; } + virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>& memoryGroup) override + { + m_MemoryGroup = boost::polymorphic_pointer_downcast<arm_compute::MemoryGroup>(memoryGroup); + } + + virtual const void* Map(bool /* blocking = true */) const override + { + return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override {} + + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } + private: arm_compute::Tensor m_Tensor; + std::shared_ptr<arm_compute::MemoryGroup> m_MemoryGroup; }; class NeonSubTensorHandle : public INeonTensorHandle { public: - NeonSubTensorHandle(arm_compute::ITensor& parent, - const arm_compute::TensorShape& shape, - const arm_compute::Coordinates& coords) - : m_Tensor(&parent, shape, coords) + NeonSubTensorHandle(INeonTensorHandle* parent, + const arm_compute::TensorShape& shape, + const arm_compute::Coordinates& coords) + : m_Tensor(&parent->GetTensor(), shape, coords) { + parentHandle = parent; } arm_compute::ITensor& GetTensor() override { return m_Tensor; } arm_compute::ITensor const& GetTensor() const override { return m_Tensor; } - virtual void Allocate() override - { - }; + + virtual void Allocate() override {} + virtual void Manage() override {} virtual ITensorHandle::Type GetType() const override { return ITensorHandle::Neon; } + virtual ITensorHandle* GetParent() const override { return parentHandle; } + virtual arm_compute::DataType GetDataType() const override { return m_Tensor.info()->data_type(); } + virtual void SetMemoryGroup(const std::shared_ptr<arm_compute::IMemoryGroup>&) override {} + + virtual const void* Map(bool /* blocking = true */) const override + { + return static_cast<const void*>(m_Tensor.buffer() + m_Tensor.info()->offset_first_element_in_bytes()); + } + virtual void Unmap() const override {} + + TensorShape GetStrides() const override + { + return armcomputetensorutils::GetStrides(m_Tensor.info()->strides_in_bytes()); + } + + TensorShape GetShape() const override + { + return armcomputetensorutils::GetShape(m_Tensor.info()->tensor_shape()); + } private: - arm_compute::SubTensor m_Tensor; + arm_compute::SubTensor m_Tensor; + ITensorHandle* parentHandle = nullptr; }; } diff --git a/src/armnn/backends/NeonWorkloadFactory.cpp b/src/armnn/backends/NeonWorkloadFactory.cpp index a17988de5a..6ea72f77cc 100644 --- a/src/armnn/backends/NeonWorkloadFactory.cpp +++ b/src/armnn/backends/NeonWorkloadFactory.cpp @@ -9,10 +9,13 @@ #ifdef ARMCOMPUTENEON_ENABLED #include "arm_compute/runtime/Allocator.h" + #include "MemCopyWorkload.hpp" #include "NeonTensorHandle.hpp" #include "NeonWorkloadUtils.hpp" #include "NeonWorkloads.hpp" + +#include "memory/IPoolManager.hpp" #endif #include "MakeWorkloadHelper.hpp" @@ -22,7 +25,8 @@ namespace armnn { -bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool 
NeonWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported) { return IWorkloadFactory::IsLayerSupported(Compute::CpuAcc, layer, dataType, outReasonIfUnsupported); } @@ -30,7 +34,7 @@ bool NeonWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType #ifdef ARMCOMPUTENEON_ENABLED NeonWorkloadFactory::NeonWorkloadFactory() -: m_MemoryManager(std::make_unique<arm_compute::Allocator>()) + : m_MemoryManager(std::make_unique<arm_compute::Allocator>(), BaseMemoryManager::MemoryAffinity::Offset) { } @@ -46,30 +50,33 @@ std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateSubTensorHandle(ITenso coords.set_num_dimensions(subTensorShape.GetNumDimensions()); for (unsigned int i = 0; i < subTensorShape.GetNumDimensions(); i++) { - // arm compute indexes tensor coords in reverse order + // Arm compute indexes tensor coords in reverse order. unsigned int revertedIndex = subTensorShape.GetNumDimensions() - i - 1; coords.set(i, boost::numeric_cast<int>(subTensorOrigin[revertedIndex])); } - return std::make_unique<NeonSubTensorHandle>(boost::polymorphic_downcast<INeonTensorHandle*>(&parent)->GetTensor(), - shape, coords); + return std::make_unique<NeonSubTensorHandle>( + boost::polymorphic_downcast<INeonTensorHandle*>(&parent), shape, coords); } std::unique_ptr<ITensorHandle> NeonWorkloadFactory::CreateTensorHandle(const TensorInfo& tensorInfo) const { - return std::make_unique<NeonTensorHandle>(tensorInfo); + auto tensorHandle = std::make_unique<NeonTensorHandle>(tensorInfo); + tensorHandle->SetMemoryGroup(m_MemoryManager.GetInterLayerMemoryGroup()); + + return tensorHandle; } std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateInput(const InputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<CopyFromCpuToNeonFloat32Workload, CopyFromCpuToNeonUint8Workload>(descriptor, info); + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<CopyFromNeonToCpuFloat32Workload, CopyFromNeonToCpuUint8Workload>(descriptor, info); + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, @@ -82,7 +89,7 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSoftmax(const SoftmaxQueue const WorkloadInfo& info) const { return MakeWorkload<NeonSoftmaxFloat32Workload, NeonSoftmaxUint8Workload>(descriptor, info, - m_MemoryManager.Get()); + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateSplitter(const SplitterQueueDescriptor& descriptor, @@ -100,13 +107,14 @@ std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMerger(const Merger std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateFullyConnected( const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<NeonFullyConnectedFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload<NeonFullyConnectedFloat32Workload, NullWorkload>(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePermute(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return 
MakeWorkload<NeonPermuteFloat32Workload, NeonPermuteUint8Workload>(descriptor, info); + return MakeWorkload<NeonPermuteFloatWorkload, NeonPermuteUint8Workload>(descriptor, info); } std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreatePooling2d(const Pooling2dQueueDescriptor& descriptor, @@ -119,7 +127,7 @@ std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateConvolution2d( const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) const { return MakeWorkload<NeonConvolution2dFloat32Workload, NeonConvolution2dUint8Workload>(descriptor, info, - m_MemoryManager.Get()); + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthwiseConvolution2d( @@ -132,7 +140,8 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateDepthwiseConvolution2d( std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateNormalization( const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<NeonNormalizationFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload<NeonNormalizationFloat32Workload, NullWorkload>(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateAddition(const AdditionQueueDescriptor& descriptor, @@ -161,21 +170,7 @@ std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateMemCopy(const MemCo throw InvalidArgumentException("NeonWorkloadFactory: Invalid null input for MemCopy workload"); } - // Create a workload that will copy tensor data from the inputs, which can have a number of different formats, - // to Neon tensors. - switch (descriptor.m_Inputs[0]->GetType()) - { - case ITensorHandle::Cpu: - return MakeWorkload<CopyFromCpuToNeonFloat32Workload, CopyFromCpuToNeonUint8Workload>(descriptor, info); -#if ARMCOMPUTECL_ENABLED - case ITensorHandle::CL: - { - return MakeWorkload<CopyFromClToNeonFloat32Workload, CopyFromClToNeonUint8Workload>(descriptor, info); - } -#endif - default: - throw InvalidArgumentException("NeonWorkloadFactory: Destination type not supported for MemCopy Workload."); - } + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<armnn::IWorkload> NeonWorkloadFactory::CreateResizeBilinear( @@ -195,7 +190,8 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFakeQuantization( std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateL2Normalization(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) const { - return MakeWorkload<NeonL2NormalizationFloat32Workload, NullWorkload>(descriptor, info, m_MemoryManager.Get()); + return MakeWorkload<NeonL2NormalizationFloat32Workload, NullWorkload>(descriptor, info, + m_MemoryManager.GetIntraLayerManager()); } std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConstant(const ConstantQueueDescriptor& descriptor, @@ -216,11 +212,41 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDesc return MakeWorkload<NeonFloorFloat32Workload, NullWorkload>(descriptor, info); } +std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return MakeWorkload<NeonLstmFloat32Workload, NullWorkload>(descriptor, info); +} + +std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return 
std::make_unique<NeonConvertFp16ToFp32Workload>(descriptor, info); +} + +std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique<NeonConvertFp32ToFp16Workload>(descriptor, info); +} + void NeonWorkloadFactory::Finalize() { m_MemoryManager.Finalize(); } +void NeonWorkloadFactory::Release() +{ + m_MemoryManager.Release(); +} + +void NeonWorkloadFactory::Acquire() +{ + m_MemoryManager.Acquire(); +} + #else // Compiled without ArmCompute libs NeonWorkloadFactory::NeonWorkloadFactory() @@ -371,9 +397,35 @@ std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateFloor(const FloorQueueDesc return nullptr; } +std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + +std::unique_ptr<IWorkload> NeonWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return nullptr; +} + void NeonWorkloadFactory::Finalize() {} +void NeonWorkloadFactory::Release() +{} + +void NeonWorkloadFactory::Acquire() +{} + #endif } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloadFactory.hpp b/src/armnn/backends/NeonWorkloadFactory.hpp index 66a69f3baf..83e1f5e75f 100644 --- a/src/armnn/backends/NeonWorkloadFactory.hpp +++ b/src/armnn/backends/NeonWorkloadFactory.hpp @@ -4,15 +4,17 @@ // #pragma once -#include "AclBaseMemoryManager.hpp" #include "OutputHandler.hpp" +#include "memory/BaseMemoryManager.hpp" + #include <boost/core/ignore_unused.hpp> +#include <boost/optional.hpp> namespace armnn { -// Neon workload factory +// Neon workload factory. 
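The Acquire() and Release() overrides added above (declared on the NeonWorkloadFactory just below) bracket the lifetime of the pooled tensor memory owned by the NeonMemoryManager. A minimal usage sketch of that lifecycle follows; the RunInference caller is hypothetical and not part of ArmNN, and the include paths are assumptions:

#include <memory>
#include <vector>

#include "backends/NeonWorkloadFactory.hpp"   // assumed include path within src/armnn
#include "backends/Workload.hpp"              // assumed location of IWorkload

// Hypothetical caller: pooled Neon tensors are intended to be resident only
// between Acquire() and Release(), so memory is held just for the inference run.
void RunInference(armnn::NeonWorkloadFactory& factory,
                  const std::vector<std::unique_ptr<armnn::IWorkload>>& workloads)
{
    factory.Acquire();               // allocates/maps the pools managed by NeonMemoryManager
    for (const auto& workload : workloads)
    {
        workload->Execute();         // inter- and intra-layer tensors draw from those pools
    }
    factory.Release();               // returns the pool memory until the next run
}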
class NeonWorkloadFactory : public IWorkloadFactory { public: @@ -20,7 +22,8 @@ public: virtual Compute GetCompute() const override { return Compute::CpuAcc; } - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const override { return true; } @@ -96,11 +99,25 @@ public: virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const override; - void Finalize() override; + virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const override; -private: + virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual void Finalize() override; - mutable AclBaseMemoryManager m_MemoryManager; + virtual void Release() override; + + virtual void Acquire() override; + +private: +#ifdef ARMCOMPUTENEON_ENABLED + mutable NeonMemoryManager m_MemoryManager; +#endif }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloadUtils.cpp b/src/armnn/backends/NeonWorkloadUtils.cpp index e807d23d6c..07e5d510eb 100644 --- a/src/armnn/backends/NeonWorkloadUtils.cpp +++ b/src/armnn/backends/NeonWorkloadUtils.cpp @@ -20,13 +20,14 @@ #include "NeonLayerSupport.hpp" #include "../../../include/armnn/Types.hpp" +#include "Half.hpp" using namespace armnn::armcomputetensorutils; namespace armnn { -// Allocate a tensor and copy the contents in data to the tensor contents +// Allocates a tensor and copy the contents in data to the tensor contents. 
template<typename T> void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data) { @@ -34,8 +35,26 @@ void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data) CopyArmComputeITensorData(data, tensor); } +template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const Half* data); template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const float* data); template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const uint8_t* data); template void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const int32_t* data); +void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor, + const ConstCpuTensorHandle* handle) +{ + BOOST_ASSERT(handle); + switch(handle->GetTensorInfo().GetDataType()) + { + case DataType::Float16: + InitialiseArmComputeTensorData(tensor, handle->GetConstTensor<Half>()); + break; + case DataType::Float32: + InitialiseArmComputeTensorData(tensor, handle->GetConstTensor<float>()); + break; + default: + BOOST_ASSERT_MSG(false, "Unexpected floating point type."); + } +}; + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloadUtils.hpp b/src/armnn/backends/NeonWorkloadUtils.hpp index ec7688237a..8169f8636a 100644 --- a/src/armnn/backends/NeonWorkloadUtils.hpp +++ b/src/armnn/backends/NeonWorkloadUtils.hpp @@ -7,6 +7,7 @@ #include "Workload.hpp" #include "backends/NeonTensorHandle.hpp" +#include "NeonTimer.hpp" #include "arm_compute/core/Types.h" #include "arm_compute/core/Helpers.h" @@ -22,4 +23,12 @@ class Layer; template<typename T> void InitialiseArmComputeTensorData(arm_compute::Tensor& tensor, const T* data); +void InitializeArmComputeTensorDataForFloatTypes(arm_compute::Tensor& tensor, const ConstCpuTensorHandle* handle); } //namespace armnn + + +#define ARMNN_SCOPED_PROFILING_EVENT_NEON(name) \ + ARMNN_SCOPED_PROFILING_EVENT_WITH_INSTRUMENTS(armnn::Compute::CpuAcc, \ + name, \ + armnn::WallClockTimer(), \ + armnn::NeonTimer()) diff --git a/src/armnn/backends/NeonWorkloads.hpp b/src/armnn/backends/NeonWorkloads.hpp index 83a3e9fd9b..9619b4e5c9 100644 --- a/src/armnn/backends/NeonWorkloads.hpp +++ b/src/armnn/backends/NeonWorkloads.hpp @@ -13,6 +13,8 @@ #include "backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonConstantFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonConstantUint8Workload.hpp" +#include "backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp" +#include "backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp" #include "backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp" #include "backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonConvolution2dUint8Workload.hpp" @@ -21,6 +23,7 @@ #include "backends/NeonWorkloads/NeonFloorFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp" +#include "backends/NeonWorkloads/NeonLstmFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonMergerFloat32Workload.hpp" #include "backends/NeonWorkloads/NeonMergerUint8Workload.hpp" #include "backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp" diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp index 39e55d5761..711bfceeaf 100644 --- a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp +++ 
b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.cpp @@ -9,9 +9,32 @@ namespace armnn { + +arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + const arm_compute::ActivationLayerInfo activationLayerInfo = + ConvertActivationDescriptorToAclActivationLayerInfo(descriptor); + + if (input.GetDataType() == DataType::QuantisedAsymm8 && + activationLayerInfo.activation() == arm_compute::ActivationLayerInfo::ActivationFunction::LOGISTIC) + { + return arm_compute::Status{arm_compute::ErrorCode::RUNTIME_ERROR, + "Neon: Logistic Activations unsupported with QAsymm8 data type."}; + } + + return arm_compute::NEActivationLayer::validate(&aclInput, + &aclOutput, + activationLayerInfo); +} + NeonActivationFloat32Workload::NeonActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ActivationQueueDescriptor>(descriptor, info) + : FloatWorkload<ActivationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("NeonActivationFloat32Workload", 1, 1); @@ -26,7 +49,7 @@ NeonActivationFloat32Workload::NeonActivationFloat32Workload(const ActivationQue void NeonActivationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonActivationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationFloat32Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp index 6fa83ea2f6..0d26b3b39f 100644 --- a/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonActivationFloat32Workload.hpp @@ -9,7 +9,12 @@ namespace armnn { -class NeonActivationFloat32Workload : public Float32Workload<ActivationQueueDescriptor> + +arm_compute::Status NeonActivationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const ActivationDescriptor& descriptor); + +class NeonActivationFloat32Workload : public FloatWorkload<ActivationQueueDescriptor> { public: NeonActivationFloat32Workload(const ActivationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp index 27c37e9425..f2e42338b2 100644 --- a/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonActivationUint8Workload.cpp @@ -13,15 +13,8 @@ NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDe const WorkloadInfo& info) : Uint8Workload<ActivationQueueDescriptor>(descriptor, info) { - - std::string reasonIfUnsupported; - if (!IsNeonActivationUint8Supported(&reasonIfUnsupported, m_Data.m_Parameters)) - { - throw InvalidArgumentException(reasonIfUnsupported); - } - - // Only BoundedReLu is supported (see IsNeonActivationUint8Supported) - arm_compute::ActivationLayerInfo layerInfo(arm_compute::ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + auto activation = ConvertActivationFunctionToAclActivationFunction(m_Data.m_Parameters.m_Function); + arm_compute::ActivationLayerInfo layerInfo(activation, 
m_Data.m_Parameters.m_A, m_Data.m_Parameters.m_B); @@ -35,7 +28,7 @@ NeonActivationUint8Workload::NeonActivationUint8Workload(const ActivationQueueDe void NeonActivationUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonActivationUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonActivationUint8Workload_Execute"); m_ActivationLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp index d1fb64093d..f26e42aff9 100644 --- a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.cpp @@ -4,14 +4,30 @@ // #include "NeonAdditionFloat32Workload.hpp" +#include "backends/ArmComputeTensorUtils.hpp" #include "backends/CpuTensorHandle.hpp" namespace armnn { +arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput0 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::NEArithmeticAddition::validate(&aclInput0, + &aclInput1, + &aclOutput, + arm_compute::ConvertPolicy::SATURATE); +} + + NeonAdditionFloat32Workload::NeonAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<AdditionQueueDescriptor>(descriptor, info) + : FloatWorkload<AdditionQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("NeonAdditionFloat32Workload", 2, 1); @@ -24,7 +40,7 @@ NeonAdditionFloat32Workload::NeonAdditionFloat32Workload(const AdditionQueueDesc void NeonAdditionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonAdditionFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonAdditionFloat32Workload_Execute"); m_AddLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp index 5b75b502a3..dae66bb69d 100644 --- a/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonAdditionFloat32Workload.hpp @@ -9,7 +9,12 @@ namespace armnn { -class NeonAdditionFloat32Workload : public Float32Workload<AdditionQueueDescriptor> + +arm_compute::Status NeonAdditionWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); + +class NeonAdditionFloat32Workload : public FloatWorkload<AdditionQueueDescriptor> { public: NeonAdditionFloat32Workload(const AdditionQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp index 247ebfc5dd..e0ad408424 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBaseConstantWorkload.hpp @@ -5,23 +5,27 @@ #pragma once +#include <arm_compute/core/Types.h> #include <backends/ArmComputeTensorUtils.hpp> #include <backends/CpuTensorHandle.hpp> #include <backends/NeonTensorHandle.hpp> +#include <backends/NeonWorkloadUtils.hpp> #include <backends/Workload.hpp> +#include <Half.hpp> #include <boost/cast.hpp> +#include "Half.hpp" namespace 
armnn { -// Base class template providing an implementation of the Constant layer common to all data types -template <armnn::DataType DataFormat> -class NeonBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataFormat> +// Base class template providing an implementation of the Constant layer common to all data types. +template <armnn::DataType... DataFormats> +class NeonBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataFormats...> { public: NeonBaseConstantWorkload(const ConstantQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload<ConstantQueueDescriptor, DataFormat>(descriptor, info) + : TypedWorkload<ConstantQueueDescriptor, DataFormats...>(descriptor, info) , m_RanOnce(false) { } @@ -41,15 +45,22 @@ public: BOOST_ASSERT(data.m_LayerOutput != nullptr); arm_compute::ITensor& output = boost::polymorphic_downcast<NeonTensorHandle*>(data.m_Outputs[0])->GetTensor(); + arm_compute::DataType computeDataType = + boost::polymorphic_downcast<NeonTensorHandle*>(data.m_Outputs[0])->GetDataType(); - switch (DataFormat) + switch (computeDataType) { - case DataType::Float32: + case arm_compute::DataType::F16: + { + CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<Half>(), output); + break; + } + case arm_compute::DataType::F32: { CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<float>(), output); break; } - case DataType::QuantisedAsymm8: + case arm_compute::DataType::QASYMM8: { CopyArmComputeITensorData(data.m_LayerOutput->GetConstTensor<uint8_t>(), output); break; diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp index 24640c7adb..6a87d62320 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBaseMergerWorkload.hpp @@ -5,20 +5,21 @@ #pragma once +#include <backends/NeonWorkloadUtils.hpp> #include <backends/Workload.hpp> namespace armnn { -// Base class template providing an implementation of the Merger layer common to all data types -template <armnn::DataType DataType> -class NeonBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataType> +// Base class template providing an implementation of the Merger layer common to all data types. +template <armnn::DataType... DataTypes> +class NeonBaseMergerWorkload : public TypedWorkload<MergerQueueDescriptor, DataTypes...> { public: - using TypedWorkload<MergerQueueDescriptor, DataType>::TypedWorkload; + using TypedWorkload<MergerQueueDescriptor, DataTypes...>::TypedWorkload; virtual void Execute() const override { - // With subtensors, merger is a no-op + // With subtensors, merger is a no-op. } }; diff --git a/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp index 769905b48b..769291c700 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBaseSplitterWorkload.hpp @@ -6,20 +6,21 @@ #pragma once #include <backends/Workload.hpp> +#include <backends/NeonWorkloadUtils.hpp> namespace armnn { -// Base class template providing an implementation of the Splitter layer common to all data types -template <armnn::DataType DataType> -class NeonBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataType> +// Base class template providing an implementation of the Splitter layer common to all data types. +template <armnn::DataType... 
DataTypes> +class NeonBaseSplitterWorkload : public TypedWorkload<SplitterQueueDescriptor, DataTypes...> { public: - using TypedWorkload<SplitterQueueDescriptor, DataType>::TypedWorkload; + using TypedWorkload<SplitterQueueDescriptor, DataTypes...>::TypedWorkload; virtual void Execute() const override { - // With subtensors, splitter is a no-op + // With subtensors, splitter is a no-op. } }; diff --git a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp index f107c8137f..ca5c8202cd 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.cpp @@ -6,40 +6,91 @@ #include "NeonBatchNormalizationFloat32Workload.hpp" #include "backends/CpuTensorHandle.hpp" #include "backends/ArmComputeTensorUtils.hpp" +#include "../../../../include/armnn/ArmNN.hpp" namespace armnn { using namespace armcomputetensorutils; + +arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInputInfo = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclMeanInfo = BuildArmComputeTensorInfo(mean); + const arm_compute::TensorInfo aclVarInfo = BuildArmComputeTensorInfo(var); + const arm_compute::TensorInfo aclBetaInfo = BuildArmComputeTensorInfo(beta); + const arm_compute::TensorInfo aclGammaInfo = BuildArmComputeTensorInfo(gamma); + + return arm_compute::NEBatchNormalizationLayer::validate(&aclInputInfo, + &aclOutputInfo, + &aclMeanInfo, + &aclVarInfo, + &aclBetaInfo, + &aclGammaInfo, + descriptor.m_Eps); +} + NeonBatchNormalizationFloat32Workload::NeonBatchNormalizationFloat32Workload( const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<BatchNormalizationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("NeonBatchNormalizationFloat32Workload", 1, 1); arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); - BuildArmComputeTensor(m_Mean, m_Data.m_Mean->GetTensorInfo()); - BuildArmComputeTensor(m_Variance, m_Data.m_Variance->GetTensorInfo()); - BuildArmComputeTensor(m_Gamma, m_Data.m_Gamma->GetTensorInfo()); - BuildArmComputeTensor(m_Beta, m_Data.m_Beta->GetTensorInfo()); + m_Mean = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_Mean, m_Data.m_Mean->GetTensorInfo()); + + m_Variance = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_Variance, m_Data.m_Variance->GetTensorInfo()); - m_Layer.configure( - &input, &output, &m_Mean, &m_Variance, &m_Beta, &m_Gamma, m_Data.m_Parameters.m_Eps); + m_Gamma = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_Gamma, m_Data.m_Gamma->GetTensorInfo()); - InitialiseArmComputeTensorData(m_Mean, m_Data.m_Mean->GetConstTensor<float>()); - InitialiseArmComputeTensorData(m_Variance, m_Data.m_Variance->GetConstTensor<float>()); - InitialiseArmComputeTensorData(m_Gamma, 
m_Data.m_Gamma->GetConstTensor<float>()); - InitialiseArmComputeTensorData(m_Beta, m_Data.m_Beta->GetConstTensor<float>()); + m_Beta = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_Beta, m_Data.m_Beta->GetTensorInfo()); + + m_Layer.configure(&input, + &output, + m_Mean.get(), + m_Variance.get(), + m_Beta.get(), + m_Gamma.get(), + m_Data.m_Parameters.m_Eps); + + InitializeArmComputeTensorDataForFloatTypes(*m_Mean, m_Data.m_Mean); + InitializeArmComputeTensorDataForFloatTypes(*m_Variance, m_Data.m_Variance); + InitializeArmComputeTensorDataForFloatTypes(*m_Gamma, m_Data.m_Gamma); + InitializeArmComputeTensorDataForFloatTypes(*m_Beta, m_Data.m_Beta); + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + m_Layer.prepare(); + FreeUnusedTensors(); } void NeonBatchNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonBatchNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonBatchNormalizationFloat32Workload_Execute"); m_Layer.run(); } +void NeonBatchNormalizationFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_Mean); + FreeTensorIfUnused(m_Variance); + FreeTensorIfUnused(m_Gamma); + FreeTensorIfUnused(m_Beta); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp index 2050d42859..5eb5601f26 100644 --- a/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonBatchNormalizationFloat32Workload.hpp @@ -10,7 +10,15 @@ namespace armnn { -class NeonBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor> +arm_compute::Status NeonBatchNormalizationValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, + const BatchNormalizationDescriptor& descriptor); + +class NeonBatchNormalizationFloat32Workload : public FloatWorkload<BatchNormalizationQueueDescriptor> { public: NeonBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, @@ -20,10 +28,12 @@ public: private: mutable arm_compute::NEBatchNormalizationLayer m_Layer; - arm_compute::Tensor m_Mean; - arm_compute::Tensor m_Variance; - arm_compute::Tensor m_Gamma; - arm_compute::Tensor m_Beta; + std::unique_ptr<arm_compute::Tensor> m_Mean; + std::unique_ptr<arm_compute::Tensor> m_Variance; + std::unique_ptr<arm_compute::Tensor> m_Gamma; + std::unique_ptr<arm_compute::Tensor> m_Beta; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp index 8b203fbf3a..4e5d570a8e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonConstantFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConstantFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantFloat32Workload_Execute"); NeonBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp 
index 4ea4dfe127..050954df24 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonConstantFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class NeonConstantFloat32Workload : public NeonBaseConstantWorkload<DataType::Float32> +class NeonConstantFloat32Workload : public NeonBaseConstantWorkload<DataType::Float16, DataType::Float32> { public: - using NeonBaseConstantWorkload<DataType::Float32>::NeonBaseConstantWorkload; + using NeonBaseConstantWorkload<DataType::Float16, DataType::Float32>::NeonBaseConstantWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp index f6dfaeb7a7..4061605bc1 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConstantUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonConstantUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConstantUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConstantUint8Workload_Execute"); NeonBaseConstantWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp new file mode 100644 index 0000000000..84fc051f65 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.cpp @@ -0,0 +1,41 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonConvertFp16ToFp32Workload.hpp" +#include "Half.hpp" +#include "FloatingPointConverter.hpp" + +#include "backends/WorkloadUtils.hpp" + +namespace armnn +{ + +NeonConvertFp16ToFp32Workload::NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) + : Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("NeonConvertFp16ToFp32Workload", 1, 1); + GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); +} + +void NeonConvertFp16ToFp32Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp16ToFp32Workload_Execute"); + + auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size) + { + auto input = reinterpret_cast<const Half*>(src); + auto output = reinterpret_cast<float*>(dst); + size_t numElements = size/2; // 2 bytes per fp16 + armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output); + }; + + for (const auto& pair : m_TensorHandlePairs) + { + CopyTensorContentsGeneric(pair.first, pair.second, convertFunc); + } +} + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp new file mode 100644 index 0000000000..136c0d8a76 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp16ToFp32Workload.hpp @@ -0,0 +1,26 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +class NeonConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor> +{ +public: + NeonConvertFp16ToFp32Workload(const ConvertFp16ToFp32QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>; + std::vector<TensorHandlePair> m_TensorHandlePairs; +}; + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp new file mode 100644 index 0000000000..61f30522a8 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.cpp @@ -0,0 +1,43 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonConvertFp32ToFp16Workload.hpp" + +#include "Half.hpp" +#include "FloatingPointConverter.hpp" + +#include "Profiling.hpp" +#include "backends/WorkloadUtils.hpp" + +namespace armnn +{ + +NeonConvertFp32ToFp16Workload::NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) + : Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>(descriptor, info) +{ + this->m_Data.ValidateInputsOutputs("NeonConvertFp32ToFp16Workload", 1, 1); + GatherTensorHandlePairs(descriptor, m_TensorHandlePairs); +} + +void NeonConvertFp32ToFp16Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvertFp32ToFp16Workload_Execute"); + + auto convertFunc = [](uint8_t* dst, const uint8_t* src, size_t size) + { + auto input = reinterpret_cast<const float*>(src); + auto output = reinterpret_cast<Half*>(dst); + size_t numElements = size/2; // 2 bytes per fp16 + armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output); + }; + + for (const auto& pair : m_TensorHandlePairs) + { + CopyTensorContentsGeneric(pair.first, pair.second, convertFunc); + } +} + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp new file mode 100644 index 0000000000..f48c365c48 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonConvertFp32ToFp16Workload.hpp @@ -0,0 +1,26 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +class NeonConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor> +{ +public: + NeonConvertFp32ToFp16Workload(const ConvertFp32ToFp16QueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; + +private: + using TensorHandlePair = std::pair<const ITensorHandle*, ITensorHandle*>; + std::vector<TensorHandlePair> m_TensorHandlePairs; +}; + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp index 423f02bcb0..e76afb6cf7 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.cpp @@ -9,6 +9,9 @@ #include "NeonConvolution2dBaseWorkload.hpp" +#include "armnn/Types.hpp" +#include "Half.hpp" + namespace armnn { @@ -41,28 +44,28 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, layerInfo); } -template<armnn::DataType dataType> -NeonConvolution2dBaseWorkload<dataType>::NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor, - const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : TypedWorkload<Convolution2dQueueDescriptor, dataType>(descriptor, info) +template<armnn::DataType... dataTypes> +NeonConvolution2dBaseWorkload<dataTypes...>::NeonConvolution2dBaseWorkload( + const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, + std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) + : TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>(descriptor, info) { using arm_compute::NEDirectConvolutionLayer; - using namespace armcomputetensorutils; ValidateData(); - // todo: check tensor shapes match + // todo: check tensor shapes match. 
arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); - BuildArmComputeTensor(m_KernelTensor, m_Data.m_Weight->GetTensorInfo()); + m_KernelTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_KernelTensor, m_Data.m_Weight->GetTensorInfo()); - arm_compute::Tensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasTensor; + m_BiasTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, @@ -81,8 +84,8 @@ NeonConvolution2dBaseWorkload<dataType>::NeonConvolution2dBaseWorkload(const Con { auto directConvolutionLayer = std::make_unique<arm_compute::NEDirectConvolutionLayer>(memoryManager); directConvolutionLayer->configure(&input, - &m_KernelTensor, - optionalBiasTensor, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); m_ConvolutionLayer.reset(directConvolutionLayer.release()); @@ -91,22 +94,50 @@ NeonConvolution2dBaseWorkload<dataType>::NeonConvolution2dBaseWorkload(const Con { auto convolutionLayer = std::make_unique<arm_compute::NEConvolutionLayer>(memoryManager); convolutionLayer->configure(&input, - &m_KernelTensor, - optionalBiasTensor, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); m_ConvolutionLayer.reset(convolutionLayer.release()); } BOOST_ASSERT(m_ConvolutionLayer); - using Type = ResolveType<dataType>; + armnn::DataType dataType = m_Data.m_Weight->GetTensorInfo().GetDataType(); + + switch (dataType) + { + case DataType::Float16: + { + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<Half>()); + break; + } + case DataType::Float32: + { + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<float>()); + break; + } + case DataType::QuantisedAsymm8: + { + InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->template GetConstTensor<uint8_t>()); + break; + } + default: + { + BOOST_ASSERT_MSG(false, "Unknown DataType."); + } + } +} - InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->template GetConstTensor<Type>()); +template<armnn::DataType... dataTypes> +void NeonConvolution2dBaseWorkload<dataTypes...>::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); } -// Generate known implementations for linker -template class NeonConvolution2dBaseWorkload<DataType::Float32>; -template class NeonConvolution2dBaseWorkload<DataType::QuantisedAsymm8>; +// Generates known implementations for linker. 
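Many of the hunks above migrate workloads from a single-type base (Float32Workload) to a parameter pack (FloatWorkload, covering Float16 and Float32), which is why the explicit instantiations below now list two data types. A condensed sketch of that pattern with illustrative names, not the real TypedWorkload definition:

#include <algorithm>
#include <array>
#include <cassert>

#include "armnn/Types.hpp"

// Illustrative only: a workload base that accepts any of the listed data types
// and checks the actual type at construction time.
template <typename QueueDescriptor, armnn::DataType... DataTypes>
class TypedWorkloadSketch
{
public:
    explicit TypedWorkloadSketch(armnn::DataType actual)
    {
        const std::array<armnn::DataType, sizeof...(DataTypes)> supported = {{ DataTypes... }};
        assert(std::find(supported.begin(), supported.end(), actual) != supported.end());
    }
};

// FloatWorkload-style alias: one class serves both FP16 and FP32 networks,
// as in the convolution base workload instantiated below.
template <typename QueueDescriptor>
using FloatWorkloadSketch =
    TypedWorkloadSketch<QueueDescriptor, armnn::DataType::Float16, armnn::DataType::Float32>;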
+template class NeonConvolution2dBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>; +template class NeonConvolution2dBaseWorkload<armnn::DataType::QuantisedAsymm8>; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp index d28d50d819..524d2c90b6 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dBaseWorkload.hpp @@ -25,11 +25,11 @@ arm_compute::Status NeonConvolution2dWorkloadValidate(const TensorInfo& input, const TensorInfo& weights, const TensorInfo& biases); -template<armnn::DataType dataType> -class NeonConvolution2dBaseWorkload : public TypedWorkload<Convolution2dQueueDescriptor, dataType> +template<armnn::DataType... dataTypes> +class NeonConvolution2dBaseWorkload : public TypedWorkload<Convolution2dQueueDescriptor, dataTypes...> { public: - using TypedWorkload<Convolution2dQueueDescriptor, dataType>::m_Data; + using TypedWorkload<Convolution2dQueueDescriptor, dataTypes...>::m_Data; NeonConvolution2dBaseWorkload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager); @@ -38,8 +38,11 @@ public: protected: std::unique_ptr<arm_compute::IFunction> m_ConvolutionLayer; - arm_compute::Tensor m_KernelTensor; - arm_compute::Tensor m_BiasTensor; + + std::unique_ptr<arm_compute::Tensor> m_KernelTensor; + std::unique_ptr<arm_compute::Tensor> m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp index f20f2a4ac5..18ec6ca2e7 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.cpp @@ -18,13 +18,16 @@ NeonConvolution2dFloat32Workload::NeonConvolution2dFloat32Workload(const Convolu { if (m_Data.m_Parameters.m_BiasEnabled) { - InitialiseArmComputeTensorData(m_BiasTensor, m_Data.m_Bias->template GetConstTensor<float>()); + InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); } + + m_ConvolutionLayer->prepare(); + FreeUnusedTensors(); } void NeonConvolution2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConvolution2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dFloat32Workload_Execute"); m_ConvolutionLayer->run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp index 56b0848efa..0bb8d69d94 100644 --- a/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dFloat32Workload.hpp @@ -15,7 +15,7 @@ namespace armnn { -class NeonConvolution2dFloat32Workload : public NeonConvolution2dBaseWorkload<DataType::Float32> +class NeonConvolution2dFloat32Workload : public NeonConvolution2dBaseWorkload<DataType::Float16, DataType::Float32> { public: NeonConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp index fb91f7b7b2..bb33e939ea 100644 --- 
a/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonConvolution2dUint8Workload.cpp @@ -14,14 +14,16 @@ NeonConvolution2dUint8Workload::NeonConvolution2dUint8Workload(const Convolution { if (m_Data.m_Parameters.m_BiasEnabled) { - InitialiseArmComputeTensorData(m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>()); + InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->template GetConstTensor<int32_t>()); } -} + m_ConvolutionLayer->prepare(); + FreeUnusedTensors(); +} void NeonConvolution2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonConvolution2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonConvolution2dUint8Workload_Execute"); m_ConvolutionLayer->run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp new file mode 100644 index 0000000000..58d6061537 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.cpp @@ -0,0 +1,46 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonDepthwiseConvolutionBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases) +{ + const arm_compute::TensorInfo aclInputInfo = + armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = + armcomputetensorutils::BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeightsInfo = + armcomputetensorutils::BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiasesInfo; + arm_compute::TensorInfo *optionalAclBiasesInfo = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiasesInfo = armcomputetensorutils::BuildArmComputeTensorInfo(biases); + optionalAclBiasesInfo = &aclBiasesInfo; + } + + const arm_compute::PadStrideInfo aclPadStrideInfo = + armcomputetensorutils::BuildArmComputePadStrideInfo(descriptor); + const unsigned int aclDepthMultiplier = weights.GetShape()[0]; + + return arm_compute::NEDepthwiseConvolutionLayer::validate(&aclInputInfo, + &aclWeightsInfo, + optionalAclBiasesInfo, + &aclOutputInfo, + aclPadStrideInfo, + aclDepthMultiplier); +} + +} diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp new file mode 100644 index 0000000000..0cead354f8 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionBaseWorkload.hpp @@ -0,0 +1,19 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonDepthwiseConvolutionWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const DepthwiseConvolution2dDescriptor& descriptor, + const TensorInfo& weights, + const TensorInfo& biases); + +} // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp index 11e31c727a..f94cd903b6 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.cpp @@ -16,23 +16,17 @@ using namespace armcomputetensorutils; NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload( const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info) + : FloatWorkload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info) { const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - std::string reasonIfUnsupported; - if (!IsNeonDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo)) - { - throw UnimplementedException(reasonIfUnsupported); - } + m_KernelTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); - BuildArmComputeTensor(m_KernelTensor, weightInfo); - - arm_compute::Tensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, @@ -54,8 +48,8 @@ NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>(); static_cast<arm_compute::NEDepthwiseConvolutionLayer3x3*>( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } @@ -64,28 +58,37 @@ NeonDepthwiseConvolutionFloat32Workload::NeonDepthwiseConvolutionFloat32Workload m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer>(); static_cast<arm_compute::NEDepthwiseConvolutionLayer*>( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<float>()); + InitializeArmComputeTensorDataForFloatTypes(*m_KernelTensor, m_Data.m_Weight); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<float>()); + InitializeArmComputeTensorDataForFloatTypes(*m_BiasTensor, m_Data.m_Bias); } + + m_pDepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void NeonDepthwiseConvolutionFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "NeonDepthwiseConvolutionFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionFloat32Workload_Execute"); BOOST_ASSERT(m_pDepthwiseConvolutionLayer); 
m_pDepthwiseConvolutionLayer->run(); } +void NeonDepthwiseConvolutionFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp index f9e295f568..ece9f1877b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class NeonDepthwiseConvolutionFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor> +class NeonDepthwiseConvolutionFloat32Workload : public FloatWorkload<DepthwiseConvolution2dQueueDescriptor> { public: NeonDepthwiseConvolutionFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, @@ -20,8 +20,10 @@ public: private: mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer; - arm_compute::Tensor m_KernelTensor; - arm_compute::Tensor m_BiasTensor; + std::unique_ptr<arm_compute::Tensor> m_KernelTensor; + std::unique_ptr<arm_compute::Tensor> m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp index bd034c4f80..45fbcb37ab 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.cpp @@ -20,19 +20,13 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload( { const TensorInfo& weightInfo = m_Data.m_Weight->GetTensorInfo(); - std::string reasonIfUnsupported; - if (!IsNeonDepthwiseConvolution2dDescParamsSupported(&reasonIfUnsupported, m_Data.m_Parameters, weightInfo)) - { - throw UnimplementedException(reasonIfUnsupported); - } + m_KernelTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_KernelTensor, weightInfo); - BuildArmComputeTensor(m_KernelTensor, weightInfo); - - arm_compute::Tensor* optionalBias = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBias = &m_BiasTensor; + m_BiasTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_BiasTensor, m_Data.m_Bias->GetTensorInfo()); } arm_compute::PadStrideInfo padStrideInfo(m_Data.m_Parameters.m_StrideX, @@ -54,8 +48,8 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload( m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer3x3>(); static_cast<arm_compute::NEDepthwiseConvolutionLayer3x3*>( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } @@ -64,28 +58,37 @@ NeonDepthwiseConvolutionUint8Workload::NeonDepthwiseConvolutionUint8Workload( m_pDepthwiseConvolutionLayer = std::make_unique<arm_compute::NEDepthwiseConvolutionLayer>(); static_cast<arm_compute::NEDepthwiseConvolutionLayer*>( m_pDepthwiseConvolutionLayer.get())->configure(&input, - &m_KernelTensor, - optionalBias, + m_KernelTensor.get(), + m_BiasTensor.get(), &output, padStrideInfo); } BOOST_ASSERT(m_pDepthwiseConvolutionLayer); - InitialiseArmComputeTensorData(m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>()); + 
InitialiseArmComputeTensorData(*m_KernelTensor, m_Data.m_Weight->GetConstTensor<uint8_t>()); - if (optionalBias) + if (m_BiasTensor) { - InitialiseArmComputeTensorData(*optionalBias, m_Data.m_Bias->GetConstTensor<int32_t>()); + InitialiseArmComputeTensorData(*m_BiasTensor, m_Data.m_Bias->GetConstTensor<int32_t>()); } + + m_pDepthwiseConvolutionLayer->prepare(); + FreeUnusedTensors(); } void NeonDepthwiseConvolutionUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::GpuAcc, "NeonDepthwiseConvolutionUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonDepthwiseConvolutionUint8Workload_Execute"); BOOST_ASSERT(m_pDepthwiseConvolutionLayer); m_pDepthwiseConvolutionLayer->run(); } +void NeonDepthwiseConvolutionUint8Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_KernelTensor); + FreeTensorIfUnused(m_BiasTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp index 9cf272e9f5..aca0ba5337 100644 --- a/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonDepthwiseConvolutionUint8Workload.hpp @@ -20,8 +20,10 @@ public: private: mutable std::unique_ptr<arm_compute::IFunction> m_pDepthwiseConvolutionLayer; - arm_compute::Tensor m_KernelTensor; - arm_compute::Tensor m_BiasTensor; + std::unique_ptr<arm_compute::Tensor> m_KernelTensor; + std::unique_ptr<arm_compute::Tensor> m_BiasTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp index a5eec5cadb..c43cfa9c46 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.cpp @@ -9,7 +9,7 @@ namespace armnn { NeonFloorFloat32Workload::NeonFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<FloorQueueDescriptor>(descriptor, info) + : FloatWorkload<FloorQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("NeonFloorFloat32Workload", 1, 1); @@ -21,7 +21,7 @@ NeonFloorFloat32Workload::NeonFloorFloat32Workload(const FloorQueueDescriptor& d void NeonFloorFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonFloorFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFloorFloat32Workload_Execute"); m_Layer.run(); } } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp index f876f1e1bb..56680f1e39 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonFloorFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class NeonFloorFloat32Workload : public Float32Workload<FloorQueueDescriptor> +class NeonFloorFloat32Workload : public FloatWorkload<FloorQueueDescriptor> { public: NeonFloorFloat32Workload(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp index e1c4448642..c3af41e20d 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.cpp @@ -4,16 +4,47 
@@ // #include "NeonFullyConnectedFloat32Workload.hpp" -#include "backends/CpuTensorHandle.hpp" + #include "backends/ArmComputeTensorUtils.hpp" +#include "backends/ArmComputeUtils.hpp" +#include "backends/CpuTensorHandle.hpp" namespace armnn { using namespace armcomputetensorutils; +arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = BuildArmComputeTensorInfo(output); + const arm_compute::TensorInfo aclWeights = BuildArmComputeTensorInfo(weights); + + arm_compute::TensorInfo aclBiases; + arm_compute::TensorInfo *optionalAclBiases = nullptr; + if (descriptor.m_BiasEnabled) + { + aclBiases = BuildArmComputeTensorInfo(biases); + optionalAclBiases = &aclBiases; + } + + const arm_compute::FullyConnectedLayerInfo fullyConnectedLayerInfo = + ConvertFullyConnectedDescriptorToAclFullyConnectedLayerInfo(descriptor); + + + return arm_compute::NEFullyConnectedLayer::validate(&aclInput, + &aclWeights, + optionalAclBiases, + &aclOutput, + fullyConnectedLayerInfo); +} + NeonFullyConnectedFloat32Workload::NeonFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info) + : FloatWorkload<FullyConnectedQueueDescriptor>(descriptor, info) , m_FullyConnectedLayer(memoryManager) { m_Data.ValidateInputsOutputs("NeonFullyConnectedFloat32Workload", 1, 1); @@ -21,33 +52,45 @@ NeonFullyConnectedFloat32Workload::NeonFullyConnectedFloat32Workload(const Fully arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); - BuildArmComputeTensor(m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); + m_WeightsTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_WeightsTensor, m_Data.m_Weight->GetTensorInfo()); - arm_compute::Tensor* optionalBiasTensor = nullptr; if (m_Data.m_Parameters.m_BiasEnabled) { - BuildArmComputeTensor(m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); - optionalBiasTensor = &m_BiasesTensor; + m_BiasesTensor = std::make_unique<arm_compute::Tensor>(); + BuildArmComputeTensor(*m_BiasesTensor, m_Data.m_Bias->GetTensorInfo()); } // Construct - m_FullyConnectedLayer.configure( - &input, &m_WeightsTensor, optionalBiasTensor, &output, m_Data.m_Parameters.m_TransposeWeightMatrix); + arm_compute::FullyConnectedLayerInfo fc_info; + fc_info.transpose_weights = m_Data.m_Parameters.m_TransposeWeightMatrix; + m_FullyConnectedLayer.configure(&input, m_WeightsTensor.get(), m_BiasesTensor.get(), &output, fc_info); // Allocate - InitialiseArmComputeTensorData(m_WeightsTensor, m_Data.m_Weight->GetConstTensor<float>()); + InitializeArmComputeTensorDataForFloatTypes(*m_WeightsTensor, m_Data.m_Weight); - if (optionalBiasTensor) + if (m_BiasesTensor) { - InitialiseArmComputeTensorData(*optionalBiasTensor, m_Data.m_Bias->GetConstTensor<float>()); + InitializeArmComputeTensorDataForFloatTypes(*m_BiasesTensor, m_Data.m_Bias); } + + // Force Compute Library to perform the necessary copying and reshaping, after which + // delete all the input tensors that will no longer be needed + 
m_FullyConnectedLayer.prepare(); + FreeUnusedTensors(); } void NeonFullyConnectedFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonFullyConnectedFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonFullyConnectedFloat32Workload_Execute"); m_FullyConnectedLayer.run(); } +void NeonFullyConnectedFloat32Workload::FreeUnusedTensors() +{ + FreeTensorIfUnused(m_WeightsTensor); + FreeTensorIfUnused(m_BiasesTensor); +} + } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp index 9c722dc573..684b5e0753 100644 --- a/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonFullyConnectedFloat32Workload.hpp @@ -14,7 +14,13 @@ namespace armnn { -class NeonFullyConnectedFloat32Workload : public Float32Workload<FullyConnectedQueueDescriptor> +arm_compute::Status NeonFullyConnectedWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, + const FullyConnectedDescriptor& descriptor); + +class NeonFullyConnectedFloat32Workload : public FloatWorkload<FullyConnectedQueueDescriptor> { public: NeonFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info, @@ -23,8 +29,11 @@ public: private: mutable arm_compute::NEFullyConnectedLayer m_FullyConnectedLayer; - arm_compute::Tensor m_WeightsTensor; - arm_compute::Tensor m_BiasesTensor; + + std::unique_ptr<arm_compute::Tensor> m_WeightsTensor; + std::unique_ptr<arm_compute::Tensor> m_BiasesTensor; + + void FreeUnusedTensors(); }; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp index 9f79fa09de..a3ae33f41f 100644 --- a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.cpp @@ -9,9 +9,21 @@ namespace armnn { +arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + CreateAclNormalizationLayerInfoForL2Normalization(input); + + return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + NeonL2NormalizationFloat32Workload::NeonL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<L2NormalizationQueueDescriptor>(descriptor, info) + : FloatWorkload<L2NormalizationQueueDescriptor>(descriptor, info) , m_Layer(memoryManager) { m_Data.ValidateInputsOutputs("NeonL2NormalizationFloat32Workload", 1, 1); @@ -23,7 +35,7 @@ NeonL2NormalizationFloat32Workload::NeonL2NormalizationFloat32Workload(const L2N void NeonL2NormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonL2NormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonL2NormalizationFloat32Workload_Execute"); m_Layer.run(); } diff --git 
a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp index 2b4a1fef37..c3fcde5a57 100644 --- a/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonL2NormalizationFloat32Workload.hpp @@ -14,7 +14,10 @@ namespace armnn { -class NeonL2NormalizationFloat32Workload : public Float32Workload<L2NormalizationQueueDescriptor> +arm_compute::Status NeonL2NormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output); + +class NeonL2NormalizationFloat32Workload : public FloatWorkload<L2NormalizationQueueDescriptor> { public: NeonL2NormalizationFloat32Workload(const L2NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp new file mode 100644 index 0000000000..ba1369e179 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.cpp @@ -0,0 +1,22 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "NeonLstmFloat32Workload.hpp" + +namespace armnn +{ +NeonLstmFloat32Workload::NeonLstmFloat32Workload(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) + : FloatWorkload<LstmQueueDescriptor>(descriptor, info) +{ + m_Data.ValidateInputsOutputs("NeonLstmFloat32Workload", 1, 1); +} + +void NeonLstmFloat32Workload::Execute() const +{ + throw armnn::Exception("No implementation of Lstm in the Neon backend!"); +} + +} // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp new file mode 100644 index 0000000000..78ee1da341 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonLstmFloat32Workload.hpp @@ -0,0 +1,20 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include <backends/NeonWorkloadUtils.hpp> + +namespace armnn +{ + +class NeonLstmFloat32Workload : public FloatWorkload<LstmQueueDescriptor> +{ +public: + NeonLstmFloat32Workload(const LstmQueueDescriptor& descriptor, const WorkloadInfo& info); + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp index 7520e8768e..30dd283620 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonMergerFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClMergerFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerFloat32Workload_Execute"); NeonBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp index 5c889c2af0..7b8ee9881f 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonMergerFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class NeonMergerFloat32Workload : public NeonBaseMergerWorkload<DataType::Float32> +class NeonMergerFloat32Workload : public NeonBaseMergerWorkload<DataType::Float16, DataType::Float32> { public: - using NeonBaseMergerWorkload<DataType::Float32>::NeonBaseMergerWorkload; + using NeonBaseMergerWorkload<DataType::Float16, DataType::Float32>::NeonBaseMergerWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp index 51578e5bff..caccdd443a 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonMergerUint8Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonMergerUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClMergerUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMergerUint8Workload_Execute"); NeonBaseMergerWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp index 58ce7b74ba..a8a3cd77b4 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.cpp @@ -9,9 +9,28 @@ namespace armnn { +arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output) +{ + const arm_compute::TensorInfo aclInput1 = armcomputetensorutils::BuildArmComputeTensorInfo(input0); + const arm_compute::TensorInfo aclInput2 = armcomputetensorutils::BuildArmComputeTensorInfo(input1); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + // At the time of writing, configure() will fail if a rounding policy other than TO_ZERO is supplied to it, + // when providing a scale of 1.0 for F32 tensors, even though the provided rounding policy appears to be + // ignored for F32 tensors. 
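The rounding-policy comment above refers to the configure() call made in the workload constructor, which sits outside the hunks shown here. As a hedged illustration only, that call presumably mirrors the arguments passed to validate() below; the wrapper function is illustrative, not part of the change:

#include <arm_compute/core/Types.h>
#include <arm_compute/runtime/NEON/functions/NEPixelWiseMultiplication.h>

// Sketch: configure the multiplication with a scale of 1.0f, SATURATE overflow
// handling and the TO_ZERO rounding policy discussed in the comment above.
inline void ConfigurePixelWiseMultiplicationSketch(arm_compute::NEPixelWiseMultiplication& function,
                                                   arm_compute::ITensor* input1,
                                                   arm_compute::ITensor* input2,
                                                   arm_compute::ITensor* output)
{
    function.configure(input1, input2, output,
                       1.0f,
                       arm_compute::ConvertPolicy::SATURATE,
                       arm_compute::RoundingPolicy::TO_ZERO);
}
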
+ return arm_compute::NEPixelWiseMultiplication::validate(&aclInput1, + &aclInput2, + &aclOutput, + 1.0f, + arm_compute::ConvertPolicy::SATURATE, + arm_compute::RoundingPolicy::TO_ZERO); +} + NeonMultiplicationFloat32Workload::NeonMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<MultiplicationQueueDescriptor>(descriptor, info) + : FloatWorkload<MultiplicationQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("NeonMultiplicationFloat32Workload", 2, 1); @@ -32,7 +51,7 @@ NeonMultiplicationFloat32Workload::NeonMultiplicationFloat32Workload(const Multi void NeonMultiplicationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonMultiplicationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonMultiplicationFloat32Workload_Execute"); m_PixelWiseMultiplication.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp index ed5ead3700..62e84a2e07 100644 --- a/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonMultiplicationFloat32Workload.hpp @@ -9,8 +9,11 @@ namespace armnn { +arm_compute::Status NeonMultiplicationWorkloadValidate(const TensorInfo& input0, + const TensorInfo& input1, + const TensorInfo& output); -class NeonMultiplicationFloat32Workload : public Float32Workload<MultiplicationQueueDescriptor> +class NeonMultiplicationFloat32Workload : public FloatWorkload<MultiplicationQueueDescriptor> { public: NeonMultiplicationFloat32Workload(const MultiplicationQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp index 0fd0dcc420..20936a2760 100644 --- a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.cpp @@ -6,13 +6,28 @@ #include "NeonNormalizationFloat32Workload.hpp" #include "backends/NeonLayerSupport.hpp" #include "backends/ArmComputeUtils.hpp" +#include "backends/ArmComputeTensorUtils.hpp" namespace armnn { +arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const NormalizationDescriptor& descriptor) +{ + const arm_compute::TensorInfo aclInput = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutput = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + arm_compute::NormalizationLayerInfo normalizationInfo = + armcomputetensorutils::BuildArmComputeNormalizationLayerInfo(descriptor); + + return arm_compute::NENormalizationLayer::validate(&aclInput, &aclOutput, normalizationInfo); +} + NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, - const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<NormalizationQueueDescriptor>(descriptor, info) + const WorkloadInfo& info, + std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) + : FloatWorkload<NormalizationQueueDescriptor>(descriptor, info) , m_NormalizationLayer(memoryManager) { m_Data.ValidateInputsOutputs("NeonNormalizationFloat32Workload", 1, 1); @@ -22,7 +37,7 @@ NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const Normali 
throw UnimplementedException(reasonIfUnsupported); } - // input and output tensors have to have the same dimensionality + // Input and output tensors have to have the same dimensionality. if (info.m_InputTensorInfos[0].GetShape()[1] != info.m_OutputTensorInfos[0].GetShape()[1] || info.m_InputTensorInfos[0].GetShape()[0] != info.m_OutputTensorInfos[0].GetShape()[0] || info.m_InputTensorInfos[0].GetShape()[3] != info.m_OutputTensorInfos[0].GetShape()[3] @@ -48,7 +63,7 @@ NeonNormalizationFloat32Workload::NeonNormalizationFloat32Workload(const Normali void NeonNormalizationFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonNormalizationFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonNormalizationFloat32Workload_Execute"); m_NormalizationLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp index 24b6da8528..8f0823454b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonNormalizationFloat32Workload.hpp @@ -12,7 +12,11 @@ namespace armnn { -class NeonNormalizationFloat32Workload : public Float32Workload<NormalizationQueueDescriptor> +arm_compute::Status NeonNormalizationWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const NormalizationDescriptor& descriptor); + +class NeonNormalizationFloat32Workload : public FloatWorkload<NormalizationQueueDescriptor> { public: NeonNormalizationFloat32Workload(const NormalizationQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp index e0a0457422..c27797ee4e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.cpp @@ -24,10 +24,10 @@ arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input, armcomputetensorutils::BuildArmComputePermutationVector(mappings)); } -template <armnn::DataType DataType> -NeonPermuteWorkload<DataType>::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor, +template <armnn::DataType... DataTypes> +NeonPermuteWorkload<DataTypes...>::NeonPermuteWorkload(const PermuteQueueDescriptor& descriptor, const WorkloadInfo& info) - : TypedWorkload<PermuteQueueDescriptor, DataType>(descriptor, info) + : TypedWorkload<PermuteQueueDescriptor, DataTypes...>(descriptor, info) { using armcomputetensorutils::BuildArmComputePermutationVector; @@ -37,18 +37,18 @@ NeonPermuteWorkload<DataType>::NeonPermuteWorkload(const PermuteQueueDescriptor& arm_compute::ITensor& output = static_cast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); const armnn::PermutationVector& mappings = m_Data.m_Parameters.m_DimMappings; - // Run the layer + // Run the layer. m_PermuteFunction.configure(&input, &output, BuildArmComputePermutationVector(mappings)); } -template <armnn::DataType DataType> -void NeonPermuteWorkload<DataType>::Execute() const +template <armnn::DataType... 
DataTypes> +void NeonPermuteWorkload<DataTypes...>::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, GetName() + "_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON(GetName() + "_Execute"); m_PermuteFunction.run(); } -template class NeonPermuteWorkload<DataType::Float32>; +template class NeonPermuteWorkload<DataType::Float16, DataType::Float32>; template class NeonPermuteWorkload<DataType::QuantisedAsymm8>; } // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp index 56e8719d6c..06b2dc692b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonPermuteWorkload.hpp @@ -7,6 +7,7 @@ #include "backends/Workload.hpp" #include "backends/WorkloadData.hpp" +#include "backends/NeonWorkloadUtils.hpp" #include <armnn/TypesUtils.hpp> #include <arm_compute/runtime/NEON/functions/NEPermute.h> @@ -18,13 +19,13 @@ namespace armnn arm_compute::Status NeonPermuteWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const PermuteDescriptor& descriptor); -template <armnn::DataType DataType> -class NeonPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataType> +template <armnn::DataType... DataTypes> +class NeonPermuteWorkload : public TypedWorkload<PermuteQueueDescriptor, DataTypes...> { public: static const std::string& GetName() { - static const std::string name = std::string("NeonPermute") + GetDataTypeName(DataType) + "Workload"; + static const std::string name = std::string("NeonPermuteWorkload"); return name; } @@ -32,11 +33,11 @@ public: void Execute() const override; private: - using TypedWorkload<PermuteQueueDescriptor, DataType>::m_Data; + using TypedWorkload<PermuteQueueDescriptor, DataTypes...>::m_Data; mutable arm_compute::NEPermute m_PermuteFunction; }; -using NeonPermuteFloat32Workload = NeonPermuteWorkload<DataType::Float32>; +using NeonPermuteFloatWorkload = NeonPermuteWorkload<DataType::Float16, DataType::Float32>; using NeonPermuteUint8Workload = NeonPermuteWorkload<DataType::QuantisedAsymm8>; -} //namespace armnn +} // namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp index 6d6a492155..3585d36ba3 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.cpp @@ -25,10 +25,10 @@ arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input, return arm_compute::NEPoolingLayer::validate(&aclInputInfo, &aclOutputInfo, layerInfo); } -template <armnn::DataType dataType> -NeonPooling2dBaseWorkload<dataType>::NeonPooling2dBaseWorkload( +template <armnn::DataType... 
dataTypes> +NeonPooling2dBaseWorkload<dataTypes...>::NeonPooling2dBaseWorkload( const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name) - : TypedWorkload<Pooling2dQueueDescriptor, dataType>(descriptor, info) + : TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>(descriptor, info) { m_Data.ValidateInputsOutputs(name, 1, 1); @@ -40,7 +40,7 @@ NeonPooling2dBaseWorkload<dataType>::NeonPooling2dBaseWorkload( m_PoolingLayer.configure(&input, &output, layerInfo); } -template class NeonPooling2dBaseWorkload<DataType::Float32>; +template class NeonPooling2dBaseWorkload<DataType::Float16, DataType::Float32>; template class NeonPooling2dBaseWorkload<DataType::QuantisedAsymm8>; } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp index 9461982f86..2e85e937fa 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dBaseWorkload.hpp @@ -14,12 +14,12 @@ arm_compute::Status NeonPooling2dWorkloadValidate(const TensorInfo& input, const TensorInfo& output, const Pooling2dDescriptor& descriptor); -// Base class template providing an implementation of the Pooling2d layer common to all data types -template <armnn::DataType dataType> -class NeonPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataType> +// Base class template providing an implementation of the Pooling2d layer common to all data types. +template <armnn::DataType... dataTypes> +class NeonPooling2dBaseWorkload : public TypedWorkload<Pooling2dQueueDescriptor, dataTypes...> { public: - using TypedWorkload<Pooling2dQueueDescriptor, dataType>::m_Data; + using TypedWorkload<Pooling2dQueueDescriptor, dataTypes...>::m_Data; NeonPooling2dBaseWorkload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info, const std::string& name); diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp index ba2aa20924..cb690c51b8 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.cpp @@ -12,13 +12,14 @@ namespace armnn NeonPooling2dFloat32Workload::NeonPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info) - : NeonPooling2dBaseWorkload<armnn::DataType::Float32>(descriptor, info, "NeonPooling2dFloat32Workload") + : NeonPooling2dBaseWorkload<armnn::DataType::Float16, armnn::DataType::Float32>(descriptor, info, + "NeonPooling2dFloat32Workload") { } void NeonPooling2dFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonPooling2dFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dFloat32Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp index 6cfc9cc96f..36c4e7edf1 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dFloat32Workload.hpp @@ -11,7 +11,8 @@ namespace armnn { -class NeonPooling2dFloat32Workload : public NeonPooling2dBaseWorkload<armnn::DataType::Float32> +class NeonPooling2dFloat32Workload : public NeonPooling2dBaseWorkload<armnn::DataType::Float16, + armnn::DataType::Float32> { public: 
NeonPooling2dFloat32Workload(const Pooling2dQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp index 0778794081..3e06d08dea 100644 --- a/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonPooling2dUint8Workload.cpp @@ -18,7 +18,7 @@ NeonPooling2dUint8Workload::NeonPooling2dUint8Workload(const Pooling2dQueueDescr void NeonPooling2dUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonPooling2dUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonPooling2dUint8Workload_Execute"); m_PoolingLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp index 317d16f6bd..93f6eb8ef5 100644 --- a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.cpp @@ -12,7 +12,7 @@ namespace armnn NeonReshapeFloat32Workload::NeonReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info) - : Float32Workload<ReshapeQueueDescriptor>(descriptor, info) + : FloatWorkload<ReshapeQueueDescriptor>(descriptor, info) { m_Data.ValidateInputsOutputs("NeonReshapeFloat32Workload", 1, 1); @@ -24,7 +24,7 @@ NeonReshapeFloat32Workload::NeonReshapeFloat32Workload(const ReshapeQueueDescrip void NeonReshapeFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonReshapeFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeFloat32Workload_Execute"); m_Layer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp index 27f4aea9e7..3e5cca1b9e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonReshapeFloat32Workload.hpp @@ -10,7 +10,7 @@ namespace armnn { -class NeonReshapeFloat32Workload : public Float32Workload<ReshapeQueueDescriptor> +class NeonReshapeFloat32Workload : public FloatWorkload<ReshapeQueueDescriptor> { public: NeonReshapeFloat32Workload(const ReshapeQueueDescriptor& descriptor, const WorkloadInfo& info); diff --git a/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp index 06f57c1e0f..b31bdcd3d0 100644 --- a/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonReshapeUint8Workload.cpp @@ -24,7 +24,7 @@ NeonReshapeUint8Workload::NeonReshapeUint8Workload(const ReshapeQueueDescriptor& void NeonReshapeUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonReshapeUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonReshapeUint8Workload_Execute"); m_Layer.run(); } } //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp new file mode 100644 index 0000000000..3efffafe25 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.cpp @@ -0,0 +1,30 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "NeonSoftmaxBaseWorkload.hpp" + +#include "backends/ArmComputeTensorUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const SoftmaxDescriptor& descriptor) +{ + // NOTE: We report 4D Softmax as unsupported until full support is added to ACL + if(input.GetShape().GetNumDimensions() >= 4u) + { + return arm_compute::Status(arm_compute::ErrorCode::RUNTIME_ERROR, "4d softmax is not supported"); + } + + const arm_compute::TensorInfo aclInputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(input); + const arm_compute::TensorInfo aclOutputInfo = armcomputetensorutils::BuildArmComputeTensorInfo(output); + + return arm_compute::NESoftmaxLayer::validate(&aclInputInfo, &aclOutputInfo, descriptor.m_Beta); +} + +} //namespace armnn + diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp new file mode 100644 index 0000000000..b9b21fb254 --- /dev/null +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxBaseWorkload.hpp @@ -0,0 +1,17 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/NeonWorkloadUtils.hpp" + +namespace armnn +{ + +arm_compute::Status NeonSoftmaxWorkloadValidate(const TensorInfo& input, + const TensorInfo& output, + const SoftmaxDescriptor& descriptor); + +} //namespace armnn diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp index 5e2925ca02..027b508ad5 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.cpp @@ -10,12 +10,12 @@ namespace armnn NeonSoftmaxFloat32Workload::NeonSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) - : Float32Workload<SoftmaxQueueDescriptor>(descriptor, info) + : FloatWorkload<SoftmaxQueueDescriptor>(descriptor, info) , m_SoftmaxLayer(memoryManager) { m_Data.ValidateInputsOutputs("NeonSoftmaxFloat32Workload", 1, 1); - // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions + // The ArmCompute softmax layer uses 2D input/output tensors, so flatten the first three dimensions. 
arm_compute::ITensor& input = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Inputs[0])->GetTensor(); arm_compute::ITensor& output = boost::polymorphic_downcast<INeonTensorHandle*>(m_Data.m_Outputs[0])->GetTensor(); @@ -24,7 +24,7 @@ NeonSoftmaxFloat32Workload::NeonSoftmaxFloat32Workload(const SoftmaxQueueDescrip void NeonSoftmaxFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSoftmaxFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxFloat32Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp index 91d25b47f8..3656a26a3c 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxFloat32Workload.hpp @@ -14,7 +14,7 @@ namespace armnn { -class NeonSoftmaxFloat32Workload : public Float32Workload<SoftmaxQueueDescriptor> +class NeonSoftmaxFloat32Workload : public FloatWorkload<SoftmaxQueueDescriptor> { public: NeonSoftmaxFloat32Workload(const SoftmaxQueueDescriptor& descriptor, const WorkloadInfo& info, diff --git a/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp index eb4a23c13c..4b0c05b25b 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSoftmaxUint8Workload.cpp @@ -32,7 +32,7 @@ NeonSoftmaxUint8Workload::NeonSoftmaxUint8Workload(const SoftmaxQueueDescriptor& void NeonSoftmaxUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "ClSoftmaxUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSoftmaxUint8Workload_Execute"); m_SoftmaxLayer.run(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp index 13701d2ed3..996fc15adb 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.cpp @@ -10,7 +10,7 @@ namespace armnn void NeonSplitterFloat32Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSplitterFloat32Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterFloat32Workload_Execute"); NeonBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp index 432f5de4eb..9f6dc75499 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp +++ b/src/armnn/backends/NeonWorkloads/NeonSplitterFloat32Workload.hpp @@ -10,10 +10,10 @@ namespace armnn { -class NeonSplitterFloat32Workload : public NeonBaseSplitterWorkload<DataType::Float32> +class NeonSplitterFloat32Workload : public NeonBaseSplitterWorkload<DataType::Float16, DataType::Float32> { public: - using NeonBaseSplitterWorkload<DataType::Float32>::NeonBaseSplitterWorkload; + using NeonBaseSplitterWorkload<DataType::Float16, DataType::Float32>::NeonBaseSplitterWorkload; virtual void Execute() const override; }; diff --git a/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp b/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp index 90d24d3ffd..0d6328ff7e 100644 --- a/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp +++ b/src/armnn/backends/NeonWorkloads/NeonSplitterUint8Workload.cpp @@ -10,7 +10,7 @@ 
namespace armnn void NeonSplitterUint8Workload::Execute() const { - ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuAcc, "NeonSplitterUint8Workload_Execute"); + ARMNN_SCOPED_PROFILING_EVENT_NEON("NeonSplitterUint8Workload_Execute"); NeonBaseSplitterWorkload::Execute(); } diff --git a/src/armnn/backends/OutputHandler.cpp b/src/armnn/backends/OutputHandler.cpp index 54afe565a9..ccc62c89ce 100644 --- a/src/armnn/backends/OutputHandler.cpp +++ b/src/armnn/backends/OutputHandler.cpp @@ -30,12 +30,4 @@ void OutputHandler::CollectWorkloadOutputs(WorkloadDataCollector& dataCollector) dataCollector.Push(m_TensorHandle.get(), m_TensorInfo); } -void OutputHandler::AllocateTensors() -{ - if (m_TensorHandle) - { - m_TensorHandle->Allocate(); - } -} - } // namespace armnn diff --git a/src/armnn/backends/OutputHandler.hpp b/src/armnn/backends/OutputHandler.hpp index 9cc87c6095..ed95577cca 100644 --- a/src/armnn/backends/OutputHandler.hpp +++ b/src/armnn/backends/OutputHandler.hpp @@ -31,30 +31,27 @@ class WorkloadDataCollector; class OutputHandler { public: - /// @brief Sets the TensorInfo used by this output handler. - /// @param tensorInfo TensorInfo for the output. + /// @brief - Sets the TensorInfo used by this output handler. + /// @param tensorInfo - TensorInfo for the output. void SetTensorInfo(const TensorInfo& tensorInfo); - /// @brief Create tensor handlers used by the intermediate tensors. Does not allocate memory. - /// @param factory Factory to be used for handler creation. + /// @brief - Creates tensor handlers used by the intermediate tensors. Does not allocate memory. + /// @param factory - Factory to be used for handler creation. void CreateTensorHandles(const IWorkloadFactory& factory); - /// @brief Get the matching TensorInfo for the output - /// @return Reference to the output TensorInfo. + /// @brief - Gets the matching TensorInfo for the output. + /// @return - References to the output TensorInfo. const TensorInfo& GetTensorInfo() const { return m_TensorInfo; } - /// @brief Get the allocated tensor memory. - /// @return Pointer to the tensor memory + /// @brief - Gets the allocated tensor memory. + /// @return - Pointer to the tensor memory. ITensorHandle* GetData() const { return m_TensorHandle.get(); } - /// Fill the outputs for a given queue descriptor + /// Fill the outputs for a given queue descriptor. void CollectWorkloadOutputs(WorkloadDataCollector& dataCollector) const; void SetData(std::unique_ptr<ITensorHandle> data) { m_TensorHandle = std::move(data); } - /// @brief Allocate memory for all the tensors assigned to the handlers - void AllocateTensors(); - /// @brief Returns true if SetTensorInfo() has been called at least once on this. 
bool IsTensorInfoSet() const { return m_bTensorInfoSet; } private: diff --git a/src/armnn/backends/RefLayerSupport.cpp b/src/armnn/backends/RefLayerSupport.cpp index 0b94656ded..ca4fca6f31 100644 --- a/src/armnn/backends/RefLayerSupport.cpp +++ b/src/armnn/backends/RefLayerSupport.cpp @@ -10,7 +10,6 @@ #include <armnn/Tensor.hpp> #include <boost/core/ignore_unused.hpp> - #include "InternalTypes.hpp" using namespace boost; @@ -27,15 +26,18 @@ bool IsSupportedForDataTypeRef(std::string* reasonIfUnsupported, { return IsSupportedForDataTypeGeneric(reasonIfUnsupported, dataType, + &FalseFunc<Params...>, floatFuncPtr, uint8FuncPtr, std::forward<Params>(params)...); } bool IsActivationSupportedRef(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), @@ -57,6 +59,11 @@ bool IsAdditionSupportedRef(const TensorInfo& input0, } bool IsBatchNormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, std::string* reasonIfUnsupported) { @@ -94,12 +101,16 @@ bool IsConvolution2dSupportedRef(const TensorInfo& input, } bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); ignore_unused(weights); + ignore_unused(biases); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, @@ -107,10 +118,16 @@ bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input, } bool IsFullyConnectedSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); + ignore_unused(weights); + ignore_unused(biases); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, @@ -127,8 +144,10 @@ bool IsInputSupportedRef(const TensorInfo& input, } bool IsL2NormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported) { + ignore_unused(output); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), &TrueFunc<>, @@ -148,9 +167,11 @@ bool IsMergerSupportedRef(const std::vector<const TensorInfo*> inputs, bool IsMultiplicationSupportedRef(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported) { ignore_unused(input1); + ignore_unused(output); return IsSupportedForDataTypeRef(reasonIfUnsupported, input0.GetDataType(), &TrueFunc<>, @@ -212,9 +233,11 @@ bool IsResizeBilinearSupportedRef(const TensorInfo& input, } bool IsSoftmaxSupportedRef(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported) { + ignore_unused(output); ignore_unused(descriptor); return IsSupportedForDataTypeRef(reasonIfUnsupported, input.GetDataType(), @@ -264,4 +287,78 @@ bool IsFloorSupportedRef(const TensorInfo& input, &FalseFuncU8<>); } +bool IsLstmSupportedRef(const TensorInfo& input, const TensorInfo& outputStateIn, + const 
TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported) +{ + ignore_unused(input); + ignore_unused(outputStateIn); + ignore_unused(cellStateIn); + ignore_unused(scratchBuffer); + ignore_unused(outputStateOut); + ignore_unused(cellStateOut); + ignore_unused(output); + ignore_unused(descriptor); + ignore_unused(inputToForgetWeights); + ignore_unused(inputToCellWeights); + ignore_unused(inputToOutputWeights); + ignore_unused(recurrentToForgetWeights); + ignore_unused(recurrentToCellWeights); + ignore_unused(recurrentToOutputWeights); + ignore_unused(forgetGateBias); + ignore_unused(cellBias); + ignore_unused(outputGateBias); + ignore_unused(inputToInputWeights); + ignore_unused(recurrentToInputWeights); + ignore_unused(cellToInputWeights); + ignore_unused(inputGateBias); + ignore_unused(projectionWeights); + ignore_unused(projectionBias); + ignore_unused(cellToForgetWeights); + ignore_unused(cellToOutputWeights); + return false; +} + +bool IsConvertFp16ToFp32SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + return (IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &TrueFunc<>, + &FalseInputFuncF32<>, + &FalseFuncU8<>) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + output.GetDataType(), + &FalseOutputFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>)); +} + +bool IsConvertFp32ToFp16SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported) +{ + return (IsSupportedForDataTypeGeneric(reasonIfUnsupported, + input.GetDataType(), + &FalseInputFuncF16<>, + &TrueFunc<>, + &FalseFuncU8<>) && + IsSupportedForDataTypeGeneric(reasonIfUnsupported, + output.GetDataType(), + &TrueFunc<>, + &FalseOutputFuncF32<>, + &FalseFuncU8<>)); +} + } diff --git a/src/armnn/backends/RefLayerSupport.hpp b/src/armnn/backends/RefLayerSupport.hpp index 9db1c14596..5e543ac537 100644 --- a/src/armnn/backends/RefLayerSupport.hpp +++ b/src/armnn/backends/RefLayerSupport.hpp @@ -7,11 +7,14 @@ #include <armnn/DescriptorsFwd.hpp> #include <armnn/Types.hpp> #include <armnn/Tensor.hpp> +#include <layers/LstmLayer.hpp> +#include <boost/optional.hpp> namespace armnn { bool IsActivationSupportedRef(const TensorInfo& input, + const TensorInfo& output, const ActivationDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -21,6 +24,11 @@ bool IsAdditionSupportedRef(const TensorInfo& input0, std::string* reasonIfUnsupported = nullptr); bool IsBatchNormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& mean, + const TensorInfo& var, + const TensorInfo& beta, + const TensorInfo& gamma, const BatchNormalizationDescriptor& descriptor, 
std::string* reasonIfUnsupported = nullptr); @@ -35,11 +43,16 @@ bool IsConvolution2dSupportedRef(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsDepthwiseConvolutionSupportedRef(const TensorInfo& input, + const TensorInfo& output, const DepthwiseConvolution2dDescriptor& descriptor, const TensorInfo& weights, + const TensorInfo& biases, std::string* reasonIfUnsupported = nullptr); bool IsFullyConnectedSupportedRef(const TensorInfo& input, + const TensorInfo& output, + const TensorInfo& weights, + const TensorInfo& biases, const FullyConnectedDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -47,14 +60,30 @@ bool IsInputSupportedRef(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsL2NormalizationSupportedRef(const TensorInfo& input, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsLstmSupportedRef(const TensorInfo& input, const TensorInfo& outputStateIn, + const TensorInfo& cellStateIn, const TensorInfo& scratchBuffer, + const TensorInfo& outputStateOut, const TensorInfo& cellStateOut, + const TensorInfo& output, const LstmDescriptor& descriptor, + const TensorInfo& inputToForgetWeights, const TensorInfo& inputToCellWeights, + const TensorInfo& inputToOutputWeights, const TensorInfo& recurrentToForgetWeights, + const TensorInfo& recurrentToCellWeights, const TensorInfo& recurrentToOutputWeights, + const TensorInfo& forgetGateBias, const TensorInfo& cellBias, + const TensorInfo& outputGateBias, const TensorInfo* inputToInputWeights, + const TensorInfo* recurrentToInputWeights, const TensorInfo* cellToInputWeights, + const TensorInfo* inputGateBias, const TensorInfo* projectionWeights, + const TensorInfo* projectionBias, const TensorInfo* cellToForgetWeights, + const TensorInfo* cellToOutputWeights, std::string* reasonIfUnsupported = nullptr); + bool IsMergerSupportedRef(const std::vector<const TensorInfo*> inputs, const OriginsDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); bool IsMultiplicationSupportedRef(const TensorInfo& input0, const TensorInfo& input1, + const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); bool IsNormalizationSupportedRef(const TensorInfo& input, @@ -79,6 +108,7 @@ bool IsResizeBilinearSupportedRef(const TensorInfo& input, std::string* reasonIfUnsupported = nullptr); bool IsSoftmaxSupportedRef(const TensorInfo& input, + const TensorInfo& output, const SoftmaxDescriptor& descriptor, std::string* reasonIfUnsupported = nullptr); @@ -97,4 +127,12 @@ bool IsFloorSupportedRef(const TensorInfo& input, const TensorInfo& output, std::string* reasonIfUnsupported = nullptr); +bool IsConvertFp16ToFp32SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + +bool IsConvertFp32ToFp16SupportedRef(const TensorInfo& input, + const TensorInfo& output, + std::string* reasonIfUnsupported = nullptr); + } diff --git a/src/armnn/backends/RefWorkloadFactory.cpp b/src/armnn/backends/RefWorkloadFactory.cpp index d7d498e89e..9294c5accc 100644 --- a/src/armnn/backends/RefWorkloadFactory.cpp +++ b/src/armnn/backends/RefWorkloadFactory.cpp @@ -18,22 +18,15 @@ template <typename F32Workload, typename U8Workload, typename QueueDescriptorTyp std::unique_ptr<IWorkload> RefWorkloadFactory::MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const { - if (!IsOperationQueueDescriptor(descriptor) || m_OperationWorkloadsAllowed) - { - return armnn::MakeWorkload<F32Workload, 
U8Workload>(descriptor, info); - } - else - { - return std::unique_ptr<IWorkload>(); - } + return armnn::MakeWorkload<NullWorkload, F32Workload, U8Workload>(descriptor, info); } -RefWorkloadFactory::RefWorkloadFactory(bool operationWorkloadsAllowed) - : m_OperationWorkloadsAllowed(operationWorkloadsAllowed) +RefWorkloadFactory::RefWorkloadFactory() { } -bool RefWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool RefWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported) { return IWorkloadFactory::IsLayerSupported(Compute::CpuRef, layer, dataType, outReasonIfUnsupported); } @@ -60,7 +53,7 @@ std::unique_ptr<IWorkload> RefWorkloadFactory::CreateInput(const InputQueueDescr throw InvalidArgumentException("RefWorkloadFactory::CreateInput: data input and output differ in byte count."); } - return MakeWorkload<CopyFromCpuToCpuFloat32Workload, CopyFromCpuToCpuUint8Workload>(descriptor, info); + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> RefWorkloadFactory::CreateOutput(const OutputQueueDescriptor& descriptor, @@ -79,7 +72,7 @@ std::unique_ptr<IWorkload> RefWorkloadFactory::CreateOutput(const OutputQueueDes throw InvalidArgumentException("RefWorkloadFactory::CreateOutput: data input and output differ in byte count."); } - return MakeWorkload<CopyFromCpuToCpuFloat32Workload, CopyFromCpuToCpuUint8Workload>(descriptor, info); + return MakeWorkload<CopyMemGenericWorkload, CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> RefWorkloadFactory::CreateActivation(const ActivationQueueDescriptor& descriptor, @@ -168,25 +161,7 @@ std::unique_ptr<armnn::IWorkload> RefWorkloadFactory::CreateMemCopy(const MemCop { throw InvalidArgumentException("RefWorkloadFactory: CreateMemCopy() expected an input tensor."); } - // Create a workload that will copy tensor data from the inputs, which can have a number of different formats, - // to CPU tensors. 
- switch (descriptor.m_Inputs[0]->GetType()) - { -#if ARMCOMPUTECL_ENABLED - case ITensorHandle::CL: - { - return MakeWorkload<CopyFromClToCpuFloat32Workload, CopyFromClToCpuUint8Workload>(descriptor, info); - } -#endif -#if ARMCOMPUTENEON_ENABLED - case ITensorHandle::Neon: - { - return MakeWorkload<CopyFromNeonToCpuFloat32Workload, CopyFromNeonToCpuUint8Workload>(descriptor, info); - } -#endif - default: - throw InvalidArgumentException("RefWorkloadFactory: Destination type not supported for MemCopy Workload."); - } + return std::make_unique<CopyMemGenericWorkload>(descriptor, info); } std::unique_ptr<IWorkload> RefWorkloadFactory::CreateResizeBilinear(const ResizeBilinearQueueDescriptor& descriptor, @@ -221,9 +196,29 @@ std::unique_ptr<IWorkload> RefWorkloadFactory::CreateReshape(const ReshapeQueueD } std::unique_ptr<IWorkload> RefWorkloadFactory::CreateFloor(const FloorQueueDescriptor& descriptor, - const WorkloadInfo& info) const + const WorkloadInfo& info) const { return MakeWorkload<RefFloorFloat32Workload, NullWorkload>(descriptor, info); } +std::unique_ptr<IWorkload> RefWorkloadFactory::CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return MakeWorkload<RefLstmFloat32Workload, NullWorkload>(descriptor, info); +} + +std::unique_ptr<IWorkload> RefWorkloadFactory::CreateConvertFp16ToFp32( + const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique<RefConvertFp16ToFp32Workload>(descriptor, info); +} + +std::unique_ptr<IWorkload> RefWorkloadFactory::CreateConvertFp32ToFp16( + const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const +{ + return std::make_unique<RefConvertFp32ToFp16Workload>(descriptor, info); +} + } // namespace armnn diff --git a/src/armnn/backends/RefWorkloadFactory.hpp b/src/armnn/backends/RefWorkloadFactory.hpp index 3fab490ad8..ee8639f8ed 100644 --- a/src/armnn/backends/RefWorkloadFactory.hpp +++ b/src/armnn/backends/RefWorkloadFactory.hpp @@ -8,6 +8,7 @@ #include "OutputHandler.hpp" #include <boost/core/ignore_unused.hpp> +#include <boost/optional.hpp> namespace armnn { @@ -24,16 +25,17 @@ constexpr bool IsOperationQueueDescriptor(const ConstantQueueDescriptor&) { retu template <> constexpr bool IsOperationQueueDescriptor(const PermuteQueueDescriptor&) { return false; } -// Reference workload factory +// Reference workload factory. 
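The reference factory above now builds workloads through MakeWorkload<NullWorkload, F32Workload, U8Workload>(descriptor, info). A hedged sketch of the data-type dispatch such a helper presumably performs; the function below is illustrative, and the real MakeWorkload in the workload utilities may differ:

#include <memory>

#include "backends/Workload.hpp" // assumed to provide IWorkload and WorkloadInfo

// Sketch: pick a workload type based on the data type recorded in the WorkloadInfo.
// Float16Workload, F32Workload and U8Workload are the template arguments supplied by
// the factory, e.g. NullWorkload for Float16 in the reference backend.
template <typename Float16Workload, typename F32Workload, typename U8Workload, typename QueueDescriptorType>
std::unique_ptr<armnn::IWorkload> MakeWorkloadSketch(const QueueDescriptorType& descriptor,
                                                     const armnn::WorkloadInfo& info)
{
    const armnn::DataType dataType = !info.m_InputTensorInfos.empty()
                                   ? info.m_InputTensorInfos[0].GetDataType()
                                   : info.m_OutputTensorInfos[0].GetDataType();
    switch (dataType)
    {
        case armnn::DataType::Float16:         return std::make_unique<Float16Workload>(descriptor, info);
        case armnn::DataType::Float32:         return std::make_unique<F32Workload>(descriptor, info);
        case armnn::DataType::QuantisedAsymm8: return std::make_unique<U8Workload>(descriptor, info);
        default:                               return nullptr;
    }
}
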
class RefWorkloadFactory : public IWorkloadFactory { public: - explicit RefWorkloadFactory(bool operationWorkloadsAllowed = true); - virtual ~RefWorkloadFactory() { }; + explicit RefWorkloadFactory(); + virtual ~RefWorkloadFactory() {} virtual Compute GetCompute() const override { return Compute::CpuRef; } - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const override { return false; } @@ -43,7 +45,7 @@ public: { boost::ignore_unused(parent, subTensorShape, subTensorOrigin); return nullptr; - }; + } virtual std::unique_ptr<ITensorHandle> CreateTensorHandle(const TensorInfo& tensorInfo) const override; @@ -113,12 +115,20 @@ public: virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const override; + virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + + virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const override; + private: template <typename F32Workload, typename U8Workload, typename QueueDescriptorType> std::unique_ptr<IWorkload> MakeWorkload(const QueueDescriptorType& descriptor, const WorkloadInfo& info) const; - const bool m_OperationWorkloadsAllowed; }; } // namespace armnn diff --git a/src/armnn/backends/RefWorkloads.hpp b/src/armnn/backends/RefWorkloads.hpp index ed4fa840da..1defdbbe82 100644 --- a/src/armnn/backends/RefWorkloads.hpp +++ b/src/armnn/backends/RefWorkloads.hpp @@ -52,3 +52,6 @@ #include "backends/RefWorkloads/Pooling2d.hpp" #include "backends/RefWorkloads/RefFakeQuantizationFloat32Workload.hpp" #include "backends/RefWorkloads/RefPermuteWorkload.hpp" +#include "backends/RefWorkloads/RefLstmFloat32Workload.hpp" +#include "backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp" +#include "backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp" diff --git a/src/armnn/backends/RefWorkloads/Activation.cpp b/src/armnn/backends/RefWorkloads/Activation.cpp index ede283cbf9..fdb6091ad7 100644 --- a/src/armnn/backends/RefWorkloads/Activation.cpp +++ b/src/armnn/backends/RefWorkloads/Activation.cpp @@ -24,7 +24,7 @@ void Activation(const float* in, float input = in[i]; float output; - // compute the result of the activation function + // Compute the result of the activation function. switch (function) { case ActivationFunction::Linear: diff --git a/src/armnn/backends/RefWorkloads/Activation.hpp b/src/armnn/backends/RefWorkloads/Activation.hpp index 874441c862..4ee604b462 100644 --- a/src/armnn/backends/RefWorkloads/Activation.hpp +++ b/src/armnn/backends/RefWorkloads/Activation.hpp @@ -9,7 +9,7 @@ namespace armnn { -/// Performs the ActivationFunction elementwise on the inputs to give the outputs +/// Performs the ActivationFunction elementwise on the inputs to give the outputs. 
void Activation(const float* in, float* out, const TensorInfo& tensorInfo, diff --git a/src/armnn/backends/RefWorkloads/Broadcast.hpp b/src/armnn/backends/RefWorkloads/Broadcast.hpp index b65b57f7a1..bdf03f2a16 100644 --- a/src/armnn/backends/RefWorkloads/Broadcast.hpp +++ b/src/armnn/backends/RefWorkloads/Broadcast.hpp @@ -43,7 +43,7 @@ struct BroadcastLoop } private: - // Struct to hold the dimension data + // Struct to hold the dimension data. struct BroadcastDimensionData { unsigned int m_DimSize; diff --git a/src/armnn/backends/RefWorkloads/ConvImpl.cpp b/src/armnn/backends/RefWorkloads/ConvImpl.cpp index 9ebadacddb..3dcd344101 100644 --- a/src/armnn/backends/RefWorkloads/ConvImpl.cpp +++ b/src/armnn/backends/RefWorkloads/ConvImpl.cpp @@ -46,7 +46,7 @@ int32_t QuantizedMultiplierSmallerThanOne::operator*(int32_t rhs) const int32_t QuantizedMultiplierSmallerThanOne::SaturatingRoundingDoublingHighMul(int32_t a, int32_t b) { - // Check for overflow + // Check for overflow. if (a == b && a == std::numeric_limits<int32_t>::min()) { return std::numeric_limits<int32_t>::max(); diff --git a/src/armnn/backends/RefWorkloads/ConvImpl.hpp b/src/armnn/backends/RefWorkloads/ConvImpl.hpp index 8b66b0b7d2..b7d5d17a8d 100644 --- a/src/armnn/backends/RefWorkloads/ConvImpl.hpp +++ b/src/armnn/backends/RefWorkloads/ConvImpl.hpp @@ -18,7 +18,7 @@ namespace armnn { -/// Performs multiplication of a integer with a multiplier which is less than one, +/// Performs multiplication of an integer with a multiplier which is less than one, /// using quantized integer arithmetic which is consistent with AndroidNN's CPU executor. struct QuantizedMultiplierSmallerThanOne { @@ -28,21 +28,21 @@ public: /// The implementation of this function is adapted from Android NN's QuantizeMultiplierSmallerThanOne(). QuantizedMultiplierSmallerThanOne(float multiplier); - /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne() + /// The implementation of this function is adapted from Android NN's MultiplyByQuantizedMultiplierSmallerThanOne(). int32_t operator*(int32_t rhs) const; private: - /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul() + /// The implementation of this function is adapted from gemmlowp's SaturatingRoundingDoublingHighMul(). static int32_t SaturatingRoundingDoublingHighMul(int32_t a, int32_t b); - /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT() + /// The implementation of this function is adapted from gemmlowp's RoundingDivideByPOT(). static int32_t RoundingDivideByPOT(int32_t x, int exponent); int32_t m_Multiplier; int32_t m_RightShift; }; -/// an implementation shared by normal and depthwise convolution +/// An implementation shared by normal and depthwise convolution. template<typename ConvData, typename InputType, typename BiasType, typename AccumulatorType> static void ConvImpl(ConvData data, const InputType* inputData, @@ -55,6 +55,7 @@ static void ConvImpl(ConvData data, InputType* outputData, float outputScale, int32_t outputOffset, + const TensorInfo& filterInfo, bool depthwise = false) { if (data.m_Parameters.m_BiasEnabled && !biasData) @@ -64,7 +65,6 @@ static void ConvImpl(ConvData data, const TensorInfo& inputInfo0 = GetTensorInfo(data.m_Inputs[0]); const TensorInfo& outputInfo0 = GetTensorInfo(data.m_Outputs[0]); - const TensorInfo& filterInfo = data.m_Weight->GetTensorInfo(); unsigned int depthMult = depthwise ? 
filterInfo.GetShape()[0] : 1; unsigned int channelsInput = filterInfo.GetShape()[1]; @@ -84,7 +84,7 @@ static void ConvImpl(ConvData data, unsigned int hStride = data.m_Parameters.m_StrideY; unsigned int xStride = data.m_Parameters.m_StrideX; - // the world's least efficient convolution + // The world's least efficient convolution. for (unsigned int batchIdx = 0; batchIdx < batchSize; batchIdx++) { for (unsigned int cOutput = 0; cOutput < channelsOutput; cOutput++) @@ -93,11 +93,11 @@ static void ConvImpl(ConvData data, { for (unsigned int xOutput = 0; xOutput < widthOutput; xOutput++) { - // this loop goes over each output element + // This loop goes over each output element. AccumulatorType sum = AccumulatorType(); - // for depthwise, each output channel corresponds to exactly one input channel - // for normal, must loop over each input channel + // For depthwise, each output channel corresponds to exactly one input channel. + // For normal, must loop over each input channel. for (unsigned int cInput = 0; cInput < (depthwise ? 1 : channelsInput); cInput++) { unsigned int depthwiseMultiplierIdx = 0; @@ -111,11 +111,11 @@ static void ConvImpl(ConvData data, { for (unsigned int xFilter = 0; xFilter < widthFilter; xFilter++) { - // this loop goes over each input element for each output element + // This loop goes over each input element for each output element. unsigned int filterIndex; - // since dimensionality of kernel depends on depthwiseness, so does index + // Since dimensionality of kernel depends on depthwiseness, so does index. if (depthwise) { filterIndex = depthwiseMultiplierIdx * widthFilter * heightFilter * channelsInput + @@ -138,7 +138,7 @@ static void ConvImpl(ConvData data, AccumulatorType inputValue; - // check if we're in the padding + // Check if we're in the padding. if (yInput < paddingTop || yInput >= heightInput + paddingTop || xInput < paddingLeft || xInput >= widthInput + paddingLeft ) { diff --git a/src/armnn/backends/RefWorkloads/FullyConnected.cpp b/src/armnn/backends/RefWorkloads/FullyConnected.cpp index 8ba11d19c6..1a8263b9a1 100644 --- a/src/armnn/backends/RefWorkloads/FullyConnected.cpp +++ b/src/armnn/backends/RefWorkloads/FullyConnected.cpp @@ -18,11 +18,11 @@ void FullyConnected(const float* inputData, const float* biasData, bool transposeWeights) { - unsigned int N = outputTensorInfo.GetShape()[1]; // Output Vector Size + unsigned int N = outputTensorInfo.GetShape()[1]; // Outputs Vector Size. - BOOST_ASSERT(inputTensorInfo.GetNumDimensions() > 1); // Need some data + BOOST_ASSERT(inputTensorInfo.GetNumDimensions() > 1); // Needs some data. - unsigned int K = 1; // Total number of activations in the input + unsigned int K = 1; // Total number of activations in the input. for (unsigned int i = 1; i < inputTensorInfo.GetNumDimensions(); i++) { K *= inputTensorInfo.GetShape()[i]; diff --git a/src/armnn/backends/RefWorkloads/FullyConnected.hpp b/src/armnn/backends/RefWorkloads/FullyConnected.hpp index 9fa2456110..fa6f54a3ec 100644 --- a/src/armnn/backends/RefWorkloads/FullyConnected.hpp +++ b/src/armnn/backends/RefWorkloads/FullyConnected.hpp @@ -10,7 +10,7 @@ namespace armnn { -/// Performs a matrix multiplication and optionally adds a bias +/// Performs a matrix multiplication and optionally adds a bias. 
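The QuantizedMultiplierSmallerThanOne helper documented in ConvImpl.hpp above multiplies an integer accumulator by a real multiplier in (0, 1) using only integer arithmetic, following gemmlowp and Android NN as its comments state. A rough self-contained sketch of that scheme (the idea, not the armnn source):

    #include <cmath>
    #include <cstdint>
    #include <limits>

    // Splits a real multiplier in (0, 1) into a Q31 fixed-point multiplier and a right shift.
    void QuantizeMultiplierSketch(float multiplier, int32_t* quantizedMultiplier, int* rightShift)
    {
        int shift = 0;
        const double q = std::frexp(static_cast<double>(multiplier), &shift); // multiplier = q * 2^shift, q in [0.5, 1)
        int64_t qFixed = static_cast<int64_t>(std::round(q * (1ll << 31)));
        if (qFixed == (1ll << 31)) { qFixed /= 2; ++shift; }
        *rightShift = -shift;
        *quantizedMultiplier = static_cast<int32_t>(qFixed);
    }

    // High 32 bits of 2*a*b with round-to-nearest; saturates the single overflowing case.
    int32_t SaturatingRoundingDoublingHighMulSketch(int32_t a, int32_t b)
    {
        if (a == b && a == std::numeric_limits<int32_t>::min())
        {
            return std::numeric_limits<int32_t>::max();
        }
        const int64_t ab = static_cast<int64_t>(a) * b;
        const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((ab + nudge) / (1ll << 31));
    }

    // Divides by 2^exponent with rounding.
    int32_t RoundingDivideByPOTSketch(int32_t x, int exponent)
    {
        const int32_t mask = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

Requantising an int32 accumulator then looks like RoundingDivideByPOTSketch(SaturatingRoundingDoublingHighMulSketch(acc, quantizedMultiplier), rightShift), with the output offset added afterwards.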
void FullyConnected(const float* inputData, float* outputData, const TensorInfo& inputTensorInfo, diff --git a/src/armnn/backends/RefWorkloads/Merger.hpp b/src/armnn/backends/RefWorkloads/Merger.hpp index 7d1bfab557..1294d05e08 100644 --- a/src/armnn/backends/RefWorkloads/Merger.hpp +++ b/src/armnn/backends/RefWorkloads/Merger.hpp @@ -29,7 +29,7 @@ void Merger(const MergerQueueDescriptor& data) for (unsigned int i=0; i<outputInfo0.GetNumDimensions(); i++) { dimensionStride /= outputInfo0.GetShape()[i]; - indices[i] = indexRemainder / dimensionStride; // use integer division to round down + indices[i] = indexRemainder / dimensionStride; // Use integer division to round down. indexRemainder -= indices[i] * dimensionStride; } @@ -37,11 +37,11 @@ void Merger(const MergerQueueDescriptor& data) { MergerQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx]; - //split view extents are defined by the size of (the corresponding) input tensor + //Split view extents are defined by the size of (the corresponding) input tensor. const TensorInfo& inputInfo = GetTensorInfo(data.m_Inputs[viewIdx]); BOOST_ASSERT(inputInfo.GetNumDimensions() == outputInfo0.GetNumDimensions()); - // check all dimensions to see if this element is inside the given input view + // Check all dimensions to see if this element is inside the given input view. bool insideView = true; for (unsigned int i=0; i<inputInfo.GetNumDimensions(); i++) { @@ -66,13 +66,13 @@ void Merger(const MergerQueueDescriptor& data) dimensionStride *= inputInfo.GetShape()[i]; } - //we are within the view, copy input data to the output corresponding to this view + //We are within the view, copy input data to the output corresponding to this view. (GetOutputTensorData<DataType>(0, data))[index] = (GetInputTensorData<DataType>(viewIdx, data))[inIndex]; - //what should we do if input views overlap on the output tensor? - //we could error, take the average, or shm else... - //for now just stop after finding first view (input) that matches. + //What should we do if input views overlap on the output tensor? + //We could error, take the average, or shm else... + //For now just stop after finding first view (input) that matches. break; } } diff --git a/src/armnn/backends/RefWorkloads/Pooling2d.cpp b/src/armnn/backends/RefWorkloads/Pooling2d.cpp index a643e67690..4047f061b3 100644 --- a/src/armnn/backends/RefWorkloads/Pooling2d.cpp +++ b/src/armnn/backends/RefWorkloads/Pooling2d.cpp @@ -164,7 +164,7 @@ void Pooling2d(const float* in, Executor execute = GetExecutor(params.m_PoolType); // Check supported padding methods outside the loop to simplify - // the inner loop + // the inner loop. if (params.m_PaddingMethod != PaddingMethod::Exclude && params.m_PaddingMethod != PaddingMethod::IgnoreValue) { @@ -192,7 +192,7 @@ void Pooling2d(const float* in, float result = defaultInitializer; float poolAreaSize = boost::numeric_cast<float>((hend - hstart) * (wend - wstart)); - // special case: when the pooling kernel is over a padding region and the padding + // Special case: when the pooling kernel is over a padding region and the padding // size is larger or equal to the kernel and the kernel only covers // padding and no real values, then we initialize the result as zero // by convention. 
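FullyConnected.cpp above derives N (the size of the output vector) and K (the total number of input activations) and then performs a plain matrix multiply with an optional bias, as the FullyConnected.hpp comment says. A naive single-batch sketch of that computation; the [N, K] / [K, N] weight layouts are a common convention and an assumption here, not taken from the armnn source:

    #include <cstddef>

    // out[n] = sum_k in[k] * W(n, k) + bias[n], for n in [0, N).
    void FullyConnectedSketch(const float* in, float* out, const float* weights, const float* bias,
                              std::size_t K, std::size_t N, bool transposeWeights)
    {
        for (std::size_t n = 0; n < N; ++n)
        {
            float sum = bias ? bias[n] : 0.0f;
            for (std::size_t k = 0; k < K; ++k)
            {
                // Weights stored as [N, K], or [K, N] when transposeWeights is set.
                sum += in[k] * (transposeWeights ? weights[k * N + n] : weights[n * K + k]);
            }
            out[n] = sum;
        }
    }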
This is because we need to choose a value here and @@ -208,8 +208,8 @@ void Pooling2d(const float* in, if (clamped && params.m_PaddingMethod == PaddingMethod::Exclude) { - // when we exclude the padding, it means we calculate with a smaller - // kernel size, so I change the divisor here + // When we exclude the padding, it means we calculate with a smaller + // kernel size, so I changed the divisor here. poolAreaSize = boost::numeric_cast<float>((hend - hstart) * (wend - wstart)); } diff --git a/src/armnn/backends/RefWorkloads/Pooling2d.hpp b/src/armnn/backends/RefWorkloads/Pooling2d.hpp index f88b1a0a4e..cefd022fb3 100644 --- a/src/armnn/backends/RefWorkloads/Pooling2d.hpp +++ b/src/armnn/backends/RefWorkloads/Pooling2d.hpp @@ -11,7 +11,7 @@ namespace armnn { -/// Computes the Pooling2d operation +/// Computes the Pooling2d operation. void Pooling2d(const float* in, float* out, const TensorInfo& inputInfo, diff --git a/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp b/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp index 0ede46d9fb..9044fca1c2 100644 --- a/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp +++ b/src/armnn/backends/RefWorkloads/RefBaseConstantWorkload.hpp @@ -13,7 +13,7 @@ namespace armnn { -// Base class template providing an implementation of the Constant layer common to all data types +// Base class template providing an implementation of the Constant layer common to all data types. template <armnn::DataType DataType> class RefBaseConstantWorkload : public TypedWorkload<ConstantQueueDescriptor, DataType> { diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp index c421b0f212..fbc1f07111 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.cpp @@ -12,15 +12,22 @@ namespace armnn { +RefBatchNormalizationFloat32Workload::RefBatchNormalizationFloat32Workload( + const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload<BatchNormalizationQueueDescriptor>(descriptor, info), + m_Mean(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Mean))), + m_Variance(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Variance))), + m_Beta(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Beta))), + m_Gamma(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Gamma))) {} void RefBatchNormalizationFloat32Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationFloat32Workload_Execute"); - const float* var = m_Data.m_Variance->GetConstTensor<float>(); - const float* mean = m_Data.m_Mean->GetConstTensor<float>(); - const float* gamma = m_Data.m_Gamma->GetConstTensor<float>(); - const float* beta = m_Data.m_Beta->GetConstTensor<float>(); + const float* var = m_Variance->GetConstTensor<float>(); + const float* mean = m_Mean->GetConstTensor<float>(); + const float* gamma = m_Gamma->GetConstTensor<float>(); + const float* beta = m_Beta->GetConstTensor<float>(); auto inputData = GetInputTensorDataFloat(0, m_Data); auto outputData = GetOutputTensorDataFloat(0, m_Data); diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp index cbcdadd749..780c329cc6 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp +++ 
b/src/armnn/backends/RefWorkloads/RefBatchNormalizationFloat32Workload.hpp @@ -14,8 +14,15 @@ namespace armnn class RefBatchNormalizationFloat32Workload : public Float32Workload<BatchNormalizationQueueDescriptor> { public: - using Float32Workload<BatchNormalizationQueueDescriptor>::Float32Workload; + explicit RefBatchNormalizationFloat32Workload(const BatchNormalizationQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Mean; + std::unique_ptr<ScopedCpuTensorHandle> m_Variance; + std::unique_ptr<ScopedCpuTensorHandle> m_Beta; + std::unique_ptr<ScopedCpuTensorHandle> m_Gamma; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp index 8a48523765..4a8e296619 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.cpp @@ -14,23 +14,30 @@ namespace armnn { +RefBatchNormalizationUint8Workload::RefBatchNormalizationUint8Workload( + const BatchNormalizationQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload<BatchNormalizationQueueDescriptor>(descriptor, info), + m_Mean(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Mean))), + m_Variance(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Variance))), + m_Beta(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Beta))), + m_Gamma(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Gamma))) {} void RefBatchNormalizationUint8Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefBatchNormalizationUint8Workload_Execute"); const TensorInfo& inputInfo0 = GetTensorInfo(m_Data.m_Inputs[0]); - const TensorInfo& varInfo = GetTensorInfo(m_Data.m_Variance); - const TensorInfo& meanInfo = GetTensorInfo(m_Data.m_Mean); - const TensorInfo& gammaInfo = GetTensorInfo(m_Data.m_Gamma); - const TensorInfo& betaInfo = GetTensorInfo(m_Data.m_Beta); + const TensorInfo& varInfo = GetTensorInfo(m_Variance.get()); + const TensorInfo& meanInfo = GetTensorInfo(m_Mean.get()); + const TensorInfo& gammaInfo = GetTensorInfo(m_Gamma.get()); + const TensorInfo& betaInfo = GetTensorInfo(m_Beta.get()); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); auto input = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo0); - auto var = Dequantize(m_Data.m_Variance->GetConstTensor<uint8_t>(), varInfo); - auto mean = Dequantize(m_Data.m_Mean->GetConstTensor<uint8_t>(), meanInfo); - auto gamma = Dequantize(m_Data.m_Gamma->GetConstTensor<uint8_t>(), gammaInfo); - auto beta = Dequantize(m_Data.m_Beta->GetConstTensor<uint8_t>(), betaInfo); + auto var = Dequantize(m_Variance->GetConstTensor<uint8_t>(), varInfo); + auto mean = Dequantize(m_Mean->GetConstTensor<uint8_t>(), meanInfo); + auto gamma = Dequantize(m_Gamma->GetConstTensor<uint8_t>(), gammaInfo); + auto beta = Dequantize(m_Beta->GetConstTensor<uint8_t>(), betaInfo); std::vector<float> results(outputInfo.GetNumElements()); BatchNormImpl(m_Data, var.data(), mean.data(), gamma.data(), beta.data(), results.data(), input.data()); diff --git a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp index 57fe995ba5..2c12d28c3f 100644 --- a/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp +++ 
b/src/armnn/backends/RefWorkloads/RefBatchNormalizationUint8Workload.hpp @@ -14,8 +14,15 @@ namespace armnn class RefBatchNormalizationUint8Workload : public Uint8Workload<BatchNormalizationQueueDescriptor> { public: - using Uint8Workload<BatchNormalizationQueueDescriptor>::Uint8Workload; + explicit RefBatchNormalizationUint8Workload(const BatchNormalizationQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Mean; + std::unique_ptr<ScopedCpuTensorHandle> m_Variance; + std::unique_ptr<ScopedCpuTensorHandle> m_Beta; + std::unique_ptr<ScopedCpuTensorHandle> m_Gamma; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp new file mode 100644 index 0000000000..c4b78014b2 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.cpp @@ -0,0 +1,25 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "RefConvertFp16ToFp32Workload.hpp" +#include "Half.hpp" +#include "RefWorkloadUtils.hpp" +#include "FloatingPointConverter.hpp" + +namespace armnn +{ + +void RefConvertFp16ToFp32Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp16ToFp32Workload_Execute"); + + const Half* const input = GetInputTensorDataHalf(0, m_Data); + float* const output = GetOutputTensorDataFloat(0, m_Data); + + unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements(); + armnnUtils::FloatingPointConverter::ConvertFloat16To32(input, numElements, output); +} + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp new file mode 100644 index 0000000000..34ae35545b --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp16ToFp32Workload.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class RefConvertFp16ToFp32Workload : public Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor> +{ +public: + using Float16ToFloat32Workload<ConvertFp16ToFp32QueueDescriptor>::Float16ToFloat32Workload; + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp new file mode 100644 index 0000000000..3c93297302 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.cpp @@ -0,0 +1,29 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
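The batch normalization workloads above now copy mean, variance, beta and gamma into ScopedCpuTensorHandle members at construction time instead of reading them from the queue descriptor at execution time; the per-element arithmetic itself is the standard inference-time formula. A sketch for one channel, where the epsilon value and the per-channel broadcasting are assumptions of the sketch:

    #include <cmath>
    #include <cstddef>

    // y = gamma * (x - mean) / sqrt(variance + epsilon) + beta, applied to every element of a channel.
    void BatchNormChannelSketch(const float* in, float* out, std::size_t numElements,
                                float mean, float variance, float gamma, float beta,
                                float epsilon = 0.001f)
    {
        const float invStdDev = 1.0f / std::sqrt(variance + epsilon);
        for (std::size_t i = 0; i < numElements; ++i)
        {
            out[i] = gamma * (in[i] - mean) * invStdDev + beta;
        }
    }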
+// + +#include "RefConvertFp32ToFp16Workload.hpp" + +#include "Half.hpp" +#include "FloatingPointConverter.hpp" +#include "RefWorkloadUtils.hpp" + +#include "Profiling.hpp" + +namespace armnn +{ + +void RefConvertFp32ToFp16Workload::Execute() const +{ + ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefConvertFp32ToFp16Workload_Execute"); + + const float* const input = GetInputTensorDataFloat(0, m_Data); + Half* const output = GetOutputTensorDataHalf(0, m_Data); + + // convert Fp32 input to Fp16 output + unsigned int numElements = GetTensorInfo(m_Data.m_Inputs[0]).GetNumElements(); + armnnUtils::FloatingPointConverter::ConvertFloat32To16(input, numElements, output); +} + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp new file mode 100644 index 0000000000..903a50449f --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefConvertFp32ToFp16Workload.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class RefConvertFp32ToFp16Workload : public Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor> +{ +public: + using Float32ToFloat16Workload<ConvertFp32ToFp16QueueDescriptor>::Float32ToFloat16Workload; + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp index 6e4cc69063..4fe823a288 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefConvolution2dFloat32Workload::RefConvolution2dFloat32Workload( + const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload<Convolution2dQueueDescriptor>(descriptor, info), + m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {} void RefConvolution2dFloat32Workload::Execute() const { @@ -19,12 +25,13 @@ void RefConvolution2dFloat32Workload::Execute() const float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Data.m_Weight->template GetConstTensor<float>(); + const float* weightData = m_Weight->template GetConstTensor<float>(); const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
- m_Data.m_Bias->template GetConstTensor<float>() : nullptr; + m_Bias->template GetConstTensor<float>() : nullptr; + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl<armnn::Convolution2dQueueDescriptor, float, float, float>( - m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0); + m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp index 514369c262..ecf0082f33 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dFloat32Workload.hpp @@ -14,8 +14,14 @@ namespace armnn class RefConvolution2dFloat32Workload : public Float32Workload<Convolution2dQueueDescriptor> { public: - using Float32Workload<Convolution2dQueueDescriptor>::Float32Workload; + explicit RefConvolution2dFloat32Workload(const Convolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Weight; + std::unique_ptr<ScopedCpuTensorHandle> m_Bias; + }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp index f390baa387..19e9c2ed0a 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefConvolution2dUint8Workload::RefConvolution2dUint8Workload( + const Convolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload<Convolution2dQueueDescriptor>(descriptor, info), + m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {} void RefConvolution2dUint8Workload::Execute() const { @@ -19,20 +25,21 @@ void RefConvolution2dUint8Workload::Execute() const const uint8_t* inputData = GetInputTensorDataU8(0, m_Data); const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); - const uint8_t* weightsData = m_Data.m_Weight->template GetConstTensor<uint8_t>(); - const TensorInfo& weightsInfo = GetTensorInfo(m_Data.m_Weight); + const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>(); + const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get()); const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
- m_Data.m_Bias->template GetConstTensor<int32_t>() : + m_Bias->template GetConstTensor<int32_t>() : nullptr; uint8_t* outputData = GetOutputTensorDataU8(0, m_Data); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl<armnn::Convolution2dQueueDescriptor, uint8_t, int32_t, int32_t>( m_Data, inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), biasData, - outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset()); + outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp index 954a206463..733d2052b2 100644 --- a/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefConvolution2dUint8Workload.hpp @@ -14,8 +14,15 @@ namespace armnn class RefConvolution2dUint8Workload : public Uint8Workload<Convolution2dQueueDescriptor> { public: - using Uint8Workload<Convolution2dQueueDescriptor>::Uint8Workload; + explicit RefConvolution2dUint8Workload(const Convolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); + virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Weight; + std::unique_ptr<ScopedCpuTensorHandle> m_Bias; + }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp index c631fecb66..f3167e299a 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefDepthwiseConvolution2dFloat32Workload::RefDepthwiseConvolution2dFloat32Workload( + const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info), + m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {} void RefDepthwiseConvolution2dFloat32Workload::Execute() const { @@ -19,12 +25,13 @@ void RefDepthwiseConvolution2dFloat32Workload::Execute() const float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Data.m_Weight->template GetConstTensor<float>(); + const float* weightData = m_Weight->template GetConstTensor<float>(); const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
- m_Data.m_Bias->template GetConstTensor<float>() : nullptr; + m_Bias->template GetConstTensor<float>() : nullptr; + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl<armnn::DepthwiseConvolution2dQueueDescriptor, float, float, float> - (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, true); + (m_Data, inputData, 0.0f, 0, weightData, 0.0f, 0, biasData, outputData, 0.0f, 0, filterInfo, true); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp index 34e6524684..042e7b3c0a 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dFloat32Workload.hpp @@ -14,8 +14,14 @@ namespace armnn class RefDepthwiseConvolution2dFloat32Workload : public Float32Workload<DepthwiseConvolution2dQueueDescriptor> { public: - using Float32Workload<DepthwiseConvolution2dQueueDescriptor>::Float32Workload; + explicit RefDepthwiseConvolution2dFloat32Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); + virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Weight; + std::unique_ptr<ScopedCpuTensorHandle> m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp index 5a8fb13112..fd5ade5559 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.cpp @@ -13,26 +13,34 @@ namespace armnn { +RefDepthwiseConvolution2dUint8Workload::RefDepthwiseConvolution2dUint8Workload( + const DepthwiseConvolution2dQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload<DepthwiseConvolution2dQueueDescriptor>(descriptor, info), + m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {} + void RefDepthwiseConvolution2dUint8Workload::Execute() const { ARMNN_SCOPED_PROFILING_EVENT(Compute::CpuRef, "RefDepthwiseConvolution2dUint8Workload_Execute"); const uint8_t* inputData = GetInputTensorDataU8(0, m_Data); const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); - const uint8_t* weightsData = m_Data.m_Weight->template GetConstTensor<uint8_t>(); - const TensorInfo& weightsInfo = GetTensorInfo(m_Data.m_Weight); + const uint8_t* weightsData = m_Weight->template GetConstTensor<uint8_t>(); + const TensorInfo& weightsInfo = GetTensorInfo(m_Weight.get()); const int32_t* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
- m_Data.m_Bias->template GetConstTensor<int32_t>() : + m_Bias->template GetConstTensor<int32_t>() : nullptr; uint8_t* outputData = GetOutputTensorDataU8(0, m_Data); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); + const TensorInfo& filterInfo = m_Weight->GetTensorInfo(); ConvImpl<armnn::DepthwiseConvolution2dQueueDescriptor, uint8_t, int32_t, int32_t>( m_Data, inputData, inputInfo.GetQuantizationScale(), inputInfo.GetQuantizationOffset(), weightsData, weightsInfo.GetQuantizationScale(), weightsInfo.GetQuantizationOffset(), biasData, - outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), true); + outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), filterInfo, true); } } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp index bd9945f529..2c8ed2d084 100644 --- a/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefDepthwiseConvolution2dUint8Workload.hpp @@ -14,8 +14,13 @@ namespace armnn class RefDepthwiseConvolution2dUint8Workload : public Uint8Workload<DepthwiseConvolution2dQueueDescriptor> { public: - using Uint8Workload<DepthwiseConvolution2dQueueDescriptor>::Uint8Workload; + explicit RefDepthwiseConvolution2dUint8Workload(const DepthwiseConvolution2dQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Weight; + std::unique_ptr<ScopedCpuTensorHandle> m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp index 6fe203e5f0..818455e0e9 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.cpp @@ -12,6 +12,12 @@ namespace armnn { +RefFullyConnectedFloat32Workload::RefFullyConnectedFloat32Workload( + const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) + : Float32Workload<FullyConnectedQueueDescriptor>(descriptor, info), + m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {} void RefFullyConnectedFloat32Workload::Execute() const { @@ -22,8 +28,8 @@ void RefFullyConnectedFloat32Workload::Execute() const float* outputData = GetOutputTensorDataFloat(0, m_Data); const float* inputData = GetInputTensorDataFloat(0, m_Data); - const float* weightData = m_Data.m_Weight->GetConstTensor<float>(); - const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? m_Data.m_Bias->GetConstTensor<float>() : nullptr; + const float* weightData = m_Weight->GetConstTensor<float>(); + const float* biasData = m_Data.m_Parameters.m_BiasEnabled ? 
m_Bias->GetConstTensor<float>() : nullptr; FullyConnected(inputData, outputData, diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp index cb835bd2ce..639d935a16 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedFloat32Workload.hpp @@ -14,8 +14,13 @@ namespace armnn class RefFullyConnectedFloat32Workload : public Float32Workload<FullyConnectedQueueDescriptor> { public: - using Float32Workload<FullyConnectedQueueDescriptor>::Float32Workload; + explicit RefFullyConnectedFloat32Workload(const FullyConnectedQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Weight; + std::unique_ptr<ScopedCpuTensorHandle> m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp index 0186d3f5e5..cd653657e1 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.cpp @@ -14,6 +14,12 @@ namespace armnn { +RefFullyConnectedUint8Workload::RefFullyConnectedUint8Workload( + const FullyConnectedQueueDescriptor& descriptor, const WorkloadInfo& info) + : Uint8Workload<FullyConnectedQueueDescriptor>(descriptor, info), + m_Weight(std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Weight))), + m_Bias(descriptor.m_Parameters.m_BiasEnabled + ? std::make_unique<ScopedCpuTensorHandle>(*(descriptor.m_Bias)) : nullptr) {} void RefFullyConnectedUint8Workload::Execute() const { @@ -22,18 +28,18 @@ void RefFullyConnectedUint8Workload::Execute() const const TensorInfo& inputInfo = GetTensorInfo(m_Data.m_Inputs[0]); const TensorInfo& outputInfo = GetTensorInfo(m_Data.m_Outputs[0]); - const uint8_t* weightData = m_Data.m_Weight->GetConstTensor<uint8_t>(); + const uint8_t* weightData = m_Weight->GetConstTensor<uint8_t>(); auto dequant = Dequantize(GetInputTensorDataU8(0, m_Data), inputInfo); - auto weight = Dequantize(weightData, m_Data.m_Weight->GetTensorInfo()); + auto weight = Dequantize(weightData, m_Weight->GetTensorInfo()); - std::vector<float> results(inputInfo.GetNumElements()); + std::vector<float> results(outputInfo.GetNumElements()); if (m_Data.m_Parameters.m_BiasEnabled) { - const int32_t* biasData = m_Data.m_Bias->GetConstTensor<int32_t>(); - auto bias = Dequantize(biasData, m_Data.m_Bias->GetTensorInfo()); + const int32_t* biasData = m_Bias->GetConstTensor<int32_t>(); + auto bias = Dequantize(biasData, m_Bias->GetTensorInfo()); FullyConnected(dequant.data(), results.data(), diff --git a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp index cd14ea85e0..36e5f631ad 100644 --- a/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp +++ b/src/armnn/backends/RefWorkloads/RefFullyConnectedUint8Workload.hpp @@ -14,8 +14,13 @@ namespace armnn class RefFullyConnectedUint8Workload : public Uint8Workload<FullyConnectedQueueDescriptor> { public: - using Uint8Workload<FullyConnectedQueueDescriptor>::Uint8Workload; + explicit RefFullyConnectedUint8Workload(const FullyConnectedQueueDescriptor& descriptor, + const WorkloadInfo& info); virtual void Execute() const override; + +private: + std::unique_ptr<ScopedCpuTensorHandle> m_Weight; + 
std::unique_ptr<ScopedCpuTensorHandle> m_Bias; }; } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp new file mode 100644 index 0000000000..bc33638310 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.cpp @@ -0,0 +1,16 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "RefLstmFloat32Workload.hpp" + +namespace armnn +{ + +void RefLstmFloat32Workload::Execute() const +{ + throw armnn::Exception("No implementation of Lstm in the Ref backend!"); +} + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp new file mode 100644 index 0000000000..0acce4d309 --- /dev/null +++ b/src/armnn/backends/RefWorkloads/RefLstmFloat32Workload.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/Workload.hpp" +#include "backends/WorkloadData.hpp" + +namespace armnn +{ + +class RefLstmFloat32Workload : public Float32Workload<LstmQueueDescriptor> +{ +public: + using Float32Workload<LstmQueueDescriptor>::Float32Workload; + virtual void Execute() const override; +}; + +} //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp b/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp index c743207423..f4dff60ae4 100644 --- a/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp +++ b/src/armnn/backends/RefWorkloads/RefNormalizationFloat32Workload.cpp @@ -17,7 +17,7 @@ namespace armnn { -// Helper function to compute "Within" normalization using Krichevsky 2012: Local Brightness Normalization +// Helper function to compute "Within" normalization using Krichevsky 2012: Local Brightness Normalization. static void NormalizeWithinUingLbr(const float* inputData, float* outputData, const TensorShape& tensorShape, @@ -80,7 +80,7 @@ static void NormalizeWithinUingLbr(const float* inputData, } } -// Helper function to compute "Across" normalization using Krichevsky 2012: Local Brightness Normalization +// Helper function to compute "Across" normalization using Krichevsky 2012: Local Brightness Normalization. 
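The uint8 reference workloads above (fully connected, batch normalization and the convolutions) all follow the same pattern: dequantize the uint8 tensors to float, run the float implementation, then requantize the result. A minimal sketch of the asymmetric scheme this implies, where real = scale * (quantized - offset); the clamping range is the usual uint8 one and an assumption of the sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstdint>

    // real = scale * (quantized - offset)
    inline float DequantizeSketch(uint8_t q, float scale, int32_t offset)
    {
        return scale * (static_cast<int32_t>(q) - offset);
    }

    // quantized = clamp(round(real / scale) + offset, 0, 255)
    inline uint8_t QuantizeSketch(float value, float scale, int32_t offset)
    {
        const int32_t q = static_cast<int32_t>(std::round(value / scale)) + offset;
        return static_cast<uint8_t>(std::min(255, std::max(0, q)));
    }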
void NormalizeAcrossUingLbr(const float* inputData, float* outputData, const TensorShape& tensorShape, diff --git a/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp b/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp index b2bb8fbf3d..93c883d826 100644 --- a/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp +++ b/src/armnn/backends/RefWorkloads/RefPermuteWorkload.cpp @@ -7,6 +7,7 @@ #include "RefWorkloadUtils.hpp" #include <Permute.hpp> +#include "TypeUtils.hpp" namespace armnn { diff --git a/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp b/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp index 088fe819e5..1df735ea55 100644 --- a/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp +++ b/src/armnn/backends/RefWorkloads/RefWorkloadUtils.hpp @@ -9,6 +9,7 @@ #include <armnn/Tensor.hpp> #include <armnn/Types.hpp> +#include <Half.hpp> #include <boost/polymorphic_cast.hpp> @@ -70,6 +71,18 @@ float* GetOutputTensorDataFloat(unsigned int idx, const PayloadType& data) return GetOutputTensorData<float>(idx, data); } +template <typename PayloadType> +const Half* GetInputTensorDataHalf(unsigned int idx, const PayloadType& data) +{ + return GetInputTensorData<Half>(idx, data); +} + +template <typename PayloadType> +Half* GetOutputTensorDataHalf(unsigned int idx, const PayloadType& data) +{ + return GetOutputTensorData<Half>(idx, data); +} + //////////////////////////////////////////// /// u8 helpers //////////////////////////////////////////// diff --git a/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp b/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp index 7b386ed467..d8bca4be44 100644 --- a/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp +++ b/src/armnn/backends/RefWorkloads/ResizeBilinear.cpp @@ -27,7 +27,7 @@ inline float Lerp(float a, float b, float w) void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, const TensorInfo& outputInfo) { - // We follow the definition of TensorFlow and AndroidNN: The top-left corner of a texel in the output + // We follow the definition of TensorFlow and AndroidNN: the top-left corner of a texel in the output // image is projected into the input image to figure out the interpolants and weights. Note that this // will yield different results than if projecting the centre of output texels. @@ -39,8 +39,8 @@ void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, co const unsigned int outputHeight = outputInfo.GetShape()[2]; const unsigned int outputWidth = outputInfo.GetShape()[3]; - // How much to scale pixel coordinates in the output image to get the corresponding pixel coordinates - // in the input image + // How much to scale pixel coordinates in the output image, to get the corresponding pixel coordinates + // in the input image. const float scaleY = boost::numeric_cast<float>(inputHeight) / boost::numeric_cast<float>(outputHeight); const float scaleX = boost::numeric_cast<float>(inputWidth) / boost::numeric_cast<float>(outputWidth); @@ -53,33 +53,33 @@ void ResizeBilinear(const float* in, const TensorInfo& inputInfo, float* out, co { for (unsigned int y = 0; y < outputHeight; ++y) { - // Corresponding real-valued height coordinate in input image + // Corresponding real-valued height coordinate in input image. const float iy = boost::numeric_cast<float>(y) * scaleY; - // Discrete height coordinate of top-left texel (in the 2x2 texel area used for interpolation) + // Discrete height coordinate of top-left texel (in the 2x2 texel area used for interpolation). 
const float fiy = floorf(iy); const unsigned int y0 = boost::numeric_cast<unsigned int>(fiy); - // Interpolation weight (range [0,1]) + // Interpolation weight (range [0,1]). const float yw = iy - fiy; for (unsigned int x = 0; x < outputWidth; ++x) { - // Real-valued and discrete width coordinates in input image + // Real-valued and discrete width coordinates in input image. const float ix = boost::numeric_cast<float>(x) * scaleX; const float fix = floorf(ix); const unsigned int x0 = boost::numeric_cast<unsigned int>(fix); - // Interpolation weight (range [0,1]) + // Interpolation weight (range [0,1]). const float xw = ix - fix; - // Discrete width/height coordinates of texels below and to the right of (x0, y0) + // Discrete width/height coordinates of texels below and to the right of (x0, y0). const unsigned int x1 = std::min(x0 + 1, inputWidth - 1u); const unsigned int y1 = std::min(y0 + 1, inputHeight - 1u); // Interpolation - const float ly0 = Lerp(input.Get(n, c, y0, x0), input.Get(n, c, y0, x1), xw); // lerp along row y0 - const float ly1 = Lerp(input.Get(n, c, y1, x0), input.Get(n, c, y1, x1), xw); // lerp along row y1 + const float ly0 = Lerp(input.Get(n, c, y0, x0), input.Get(n, c, y0, x1), xw); // lerp along row y0. + const float ly1 = Lerp(input.Get(n, c, y1, x0), input.Get(n, c, y1, x1), xw); // lerp along row y1. const float l = Lerp(ly0, ly1, yw); output.Get(n, c, y, x) = l; diff --git a/src/armnn/backends/RefWorkloads/Softmax.cpp b/src/armnn/backends/RefWorkloads/Softmax.cpp index 58840e3076..c9f0bc5e59 100644 --- a/src/armnn/backends/RefWorkloads/Softmax.cpp +++ b/src/armnn/backends/RefWorkloads/Softmax.cpp @@ -11,13 +11,13 @@ namespace armnn { -/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo +/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo. void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float beta) { unsigned int numChannels = tensorInfo.GetShape()[1]; for (unsigned int n = 0; n < tensorInfo.GetShape()[0]; n++) { - // find maximum channel + // Find maximum channel. float max = in[n * numChannels]; for (unsigned int c = 1; c < numChannels; c++) { @@ -28,7 +28,7 @@ void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float be } } - // exponentiate all values and sum + // Exponentiate all values and sum. std::vector<float> exponentials(numChannels); float sum = 0.0f; for (unsigned int c = 0; c < numChannels; c++) @@ -38,7 +38,7 @@ void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float be sum += exponentials[c]; } - // divide exponentials by sum to give outputs + // Divide exponentials by sum to give outputs. for (unsigned int c = 0; c < numChannels; c++) { out[n * numChannels + c] = exponentials[c] / sum; diff --git a/src/armnn/backends/RefWorkloads/Softmax.hpp b/src/armnn/backends/RefWorkloads/Softmax.hpp index c508ab2b82..f75388dc2b 100644 --- a/src/armnn/backends/RefWorkloads/Softmax.hpp +++ b/src/armnn/backends/RefWorkloads/Softmax.hpp @@ -10,7 +10,7 @@ namespace armnn { -/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo +/// Computes the softmax function on some inputs, into outputs, with a shape given by tensorInfo. 
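The Softmax.cpp hunk above follows exactly the three steps its reworded comments describe: find the per-row maximum, exponentiate, and divide by the sum. A standalone sketch over one row of channels; subtracting the maximum before exponentiating (the usual reason for computing it first) and the way beta scales the logits are assumptions of the sketch:

    #include <algorithm>
    #include <cmath>
    #include <cstddef>
    #include <vector>

    // Softmax over one row of numChannels values; beta scales the logits.
    void SoftmaxRowSketch(const float* in, float* out, std::size_t numChannels, float beta)
    {
        // Find the maximum so that no exponential overflows.
        float maxValue = in[0];
        for (std::size_t c = 1; c < numChannels; ++c)
        {
            maxValue = std::max(maxValue, in[c]);
        }

        // Exponentiate all values and accumulate the sum.
        std::vector<float> exponentials(numChannels);
        float sum = 0.0f;
        for (std::size_t c = 0; c < numChannels; ++c)
        {
            exponentials[c] = std::exp((in[c] - maxValue) * beta);
            sum += exponentials[c];
        }

        // Divide by the sum to produce the outputs.
        for (std::size_t c = 0; c < numChannels; ++c)
        {
            out[c] = exponentials[c] / sum;
        }
    }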
void Softmax(const float* in, float* out, const TensorInfo& tensorInfo, float beta); } //namespace armnn diff --git a/src/armnn/backends/RefWorkloads/Splitter.hpp b/src/armnn/backends/RefWorkloads/Splitter.hpp index bd5da6cfe2..c12d9368bf 100644 --- a/src/armnn/backends/RefWorkloads/Splitter.hpp +++ b/src/armnn/backends/RefWorkloads/Splitter.hpp @@ -31,7 +31,7 @@ void Splitter(const SplitterQueueDescriptor& data) for (unsigned int i = 0; i<inputInfo0.GetNumDimensions(); i++) { dimensionStride /= inputInfo0.GetShape()[i]; - indices[i] = indexRemainder / dimensionStride; // use integer division to round down + indices[i] = indexRemainder / dimensionStride; // Use integer division to round down. indexRemainder -= indices[i] * dimensionStride; } @@ -39,11 +39,11 @@ void Splitter(const SplitterQueueDescriptor& data) { SplitterQueueDescriptor::ViewOrigin const& view = data.m_ViewOrigins[viewIdx]; - //split view extents are defined by the size of (the corresponding) input tensor + //Split view extents are defined by the size of (the corresponding) input tensor. const TensorInfo& outputInfo = GetTensorInfo(data.m_Outputs[viewIdx]); BOOST_ASSERT(outputInfo.GetNumDimensions() == inputInfo0.GetNumDimensions()); - // check all dimensions to see if this element is inside the given input view + // Check all dimensions to see if this element is inside the given input view. bool insideView = true; for (unsigned int i = 0; i<outputInfo.GetNumDimensions(); i++) { @@ -68,7 +68,7 @@ void Splitter(const SplitterQueueDescriptor& data) dimensionStride *= outputInfo.GetShape()[i]; } - //we are within the view, copy input data to the output corresponding to this view + //We are within the view, to copy input data to the output corresponding to this view. DataType* outputData = GetOutputTensorData<DataType>(viewIdx, data); BOOST_ASSERT(outputData); diff --git a/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp b/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp index 3994c1f1de..ad0f38e867 100644 --- a/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp +++ b/src/armnn/backends/RefWorkloads/TensorBufferArrayView.hpp @@ -10,7 +10,7 @@ namespace armnn { -// Utility class providing access to raw tensor memory based on indices along each dimension +// Utility class providing access to raw tensor memory based on indices along each dimension. template <typename DataType> class TensorBufferArrayView { diff --git a/src/armnn/backends/Workload.hpp b/src/armnn/backends/Workload.hpp index dbc7574d0e..5da03bc61d 100644 --- a/src/armnn/backends/Workload.hpp +++ b/src/armnn/backends/Workload.hpp @@ -12,11 +12,11 @@ namespace armnn { -// Workload interface to enqueue a layer computation +// Workload interface to enqueue a layer computation. class IWorkload { public: - virtual ~IWorkload(){}; + virtual ~IWorkload() {} virtual void Execute() const = 0; }; @@ -46,7 +46,8 @@ protected: const QueueDescriptor m_Data; }; -template <typename QueueDescriptor, armnn::DataType DataType> +// TypedWorkload used +template <typename QueueDescriptor, armnn::DataType... 
DataTypes> class TypedWorkload : public BaseWorkload<QueueDescriptor> { public: @@ -54,27 +55,93 @@ public: TypedWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info) : BaseWorkload<QueueDescriptor>(descriptor, info) { + std::vector<armnn::DataType> dataTypes = {DataTypes...}; + armnn::DataType expectedInputType; + + if (!info.m_InputTensorInfos.empty()) + { + expectedInputType = info.m_InputTensorInfos.front().GetDataType(); + + if (std::find(dataTypes.begin(), dataTypes.end(), expectedInputType) == dataTypes.end()) + { + BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type"); + } + BOOST_ASSERT_MSG(std::all_of(std::next(info.m_InputTensorInfos.begin()), + info.m_InputTensorInfos.end(), + [&](auto it){ + return it.GetDataType() == expectedInputType; + }), + "Trying to create workload with incorrect type"); + } + armnn::DataType expectedOutputType; + + if (!info.m_OutputTensorInfos.empty()) + { + expectedOutputType = info.m_OutputTensorInfos.front().GetDataType(); + + if (!info.m_InputTensorInfos.empty()) + { + if (expectedOutputType != expectedInputType) + { + BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type"); + } + } + else if (std::find(dataTypes.begin(), dataTypes.end(), expectedOutputType) == dataTypes.end()) + { + BOOST_ASSERT_MSG(false, "Trying to create workload with incorrect type"); + } + BOOST_ASSERT_MSG(std::all_of(std::next(info.m_OutputTensorInfos.begin()), + info.m_OutputTensorInfos.end(), + [&](auto it){ + return it.GetDataType() == expectedOutputType; + }), + "Trying to create workload with incorrect type"); + } + } +}; + +template <typename QueueDescriptor, armnn::DataType InputDataType, armnn::DataType OutputDataType> +class MultiTypedWorkload : public BaseWorkload<QueueDescriptor> +{ +public: + + MultiTypedWorkload(const QueueDescriptor& descriptor, const WorkloadInfo& info) + : BaseWorkload<QueueDescriptor>(descriptor, info) + { BOOST_ASSERT_MSG(std::all_of(info.m_InputTensorInfos.begin(), info.m_InputTensorInfos.end(), [&](auto it){ - return it.GetDataType() == DataType; + return it.GetDataType() == InputDataType; }), "Trying to create workload with incorrect type"); BOOST_ASSERT_MSG(std::all_of(info.m_OutputTensorInfos.begin(), info.m_OutputTensorInfos.end(), [&](auto it){ - return it.GetDataType() == DataType; + return it.GetDataType() == OutputDataType; }), "Trying to create workload with incorrect type"); } - - static constexpr armnn::DataType ms_DataType = DataType; }; template <typename QueueDescriptor> +using FloatWorkload = TypedWorkload<QueueDescriptor, + armnn::DataType::Float16, + armnn::DataType::Float32>; + +template <typename QueueDescriptor> using Float32Workload = TypedWorkload<QueueDescriptor, armnn::DataType::Float32>; template <typename QueueDescriptor> using Uint8Workload = TypedWorkload<QueueDescriptor, armnn::DataType::QuantisedAsymm8>; +template <typename QueueDescriptor> +using Float16ToFloat32Workload = MultiTypedWorkload<QueueDescriptor, + armnn::DataType::Float16, + armnn::DataType::Float32>; + +template <typename QueueDescriptor> +using Float32ToFloat16Workload = MultiTypedWorkload<QueueDescriptor, + armnn::DataType::Float32, + armnn::DataType::Float16>; + } //namespace armnn diff --git a/src/armnn/backends/WorkloadData.cpp b/src/armnn/backends/WorkloadData.cpp index c951fc5d8d..aa763801ce 100644 --- a/src/armnn/backends/WorkloadData.cpp +++ b/src/armnn/backends/WorkloadData.cpp @@ -22,6 +22,8 @@ DataType GetBiasDataType(DataType inputDataType) { switch (inputDataType) { + case 
DataType::Float16: + return DataType::Float16; case DataType::Float32: return DataType::Float32; case DataType::QuantisedAsymm8: @@ -148,7 +150,7 @@ void ValidateBiasTensorQuantization(const TensorInfo& biasTensor, const TensorIn to_string(biasTensor.GetQuantizationOffset())); } const float expectedScale = inputTensorInfo.GetQuantizationScale() * weightsTensorInfo.GetQuantizationScale(); - if (biasTensor.GetQuantizationScale() != expectedScale) + if (std::abs(biasTensor.GetQuantizationScale() - expectedScale) > 0.000000001f) { // Print the float values with extra precision to see very small differences std::stringstream msg; @@ -338,11 +340,11 @@ void SplitterQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const ". Number of workloadInfo.m_OutputTensorInfos: " + to_string(workloadInfo.m_OutputTensorInfos.size())); } - //the dimensionality of all the windows has to match the dimensionality (not shape) of the input + //The dimensionality of all the windows has to match the dimensionality (not shape) of the input. std::size_t inputDims = workloadInfo.m_InputTensorInfos[0].GetNumDimensions(); for(unsigned int w = 0; w < m_ViewOrigins.size(); ++w ) { - //check that the dimensionality of input is same as the split windows + //Checks that the dimensionality of input is same as the split windows. ViewOrigin const& e = m_ViewOrigins[w]; if (e.m_Origin.size() != inputDims) { @@ -399,11 +401,11 @@ void MergerQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const ". Number of workloadInfo.m_InputTensorInfos: " + to_string(workloadInfo.m_InputTensorInfos.size())); } - //the dimensionality of all the windows has to match the dimensionality (not shape) of the output + //The dimensionality of all the windows has to match the dimensionality (not shape) of the output. std::size_t outputDims = workloadInfo.m_OutputTensorInfos[0].GetNumDimensions(); for(unsigned int w = 0; w < m_ViewOrigins.size(); ++w ) { - //check that the dimensionality of output is same as the split windows + //Checks that the dimensionality of output is same as the split windows. ViewOrigin const& e = m_ViewOrigins[w]; if (e.m_Origin.size() != outputDims) { @@ -415,7 +417,7 @@ void MergerQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const "tensor has " + to_string(outputDims) + " dimensions."); } - //check that the merge windows are within the output tensor + //Checks that the merge windows are within the output tensor. for (unsigned int i = 0; i < e.m_Origin.size(); ++i) { if (e.m_Origin[i] + workloadInfo.m_InputTensorInfos[w].GetShape()[i] @@ -456,7 +458,7 @@ void FullyConnectedQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c "bias value tensor descriptor is missing."); } - // validate type and quantization values + // Validates type and quantization values. ValidateBiasTensorQuantization(m_Bias->GetTensorInfo(), workloadInfo.m_InputTensorInfos[0], m_Weight->GetTensorInfo(), "FullyConnectedQueueDescriptor"); @@ -578,7 +580,7 @@ void DepthwiseConvolution2dQueueDescriptor::Validate(const WorkloadInfo& workloa ValidatePointer(m_Weight, "DepthwiseConvolution2dQueueDescriptor", "weight"); ValidateTensorNumDimensions(m_Weight->GetTensorInfo(), "DepthwiseConvolution2dQueueDescriptor", 4, "weight"); - //inputChannels * channelMultiplier should be equal to outputChannels + //inputChannels * channelMultiplier should be equal to outputChannels. 
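Going back to the Workload.hpp changes a little further up: TypedWorkload now takes a parameter pack of allowed data types (which is how the new FloatWorkload alias can accept either Float16 or Float32), while MultiTypedWorkload pins distinct input and output types for the conversion workloads. A stripped-down sketch of the same idea with made-up names, independent of the armnn classes:

    #include <algorithm>
    #include <cassert>
    #include <initializer_list>
    #include <vector>

    enum class DataType { Float16, Float32, QuantisedAsymm8 };

    // A workload parameterised on the set of tensor data types it accepts; the constructor
    // rejects any tensor whose type is not in the pack, as TypedWorkload's constructor does.
    template <DataType... AllowedTypes>
    struct TypedWorkloadSketch
    {
        explicit TypedWorkloadSketch(const std::vector<DataType>& tensorTypes)
        {
            const std::initializer_list<DataType> allowed = {AllowedTypes...};
            for (DataType type : tensorTypes)
            {
                assert(std::find(allowed.begin(), allowed.end(), type) != allowed.end() &&
                       "Trying to create workload with incorrect type");
            }
        }
    };

    // Mirrors the new aliases: a "float" workload accepts either precision.
    using FloatWorkloadSketch = TypedWorkloadSketch<DataType::Float16, DataType::Float32>;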
const unsigned int numWeightChannelMultiplier = m_Weight->GetTensorInfo().GetShape()[0]; const unsigned int numWeightInputChannels = m_Weight->GetTensorInfo().GetShape()[1]; const unsigned int numWeightOutputChannels = workloadInfo.m_OutputTensorInfos[0].GetShape()[1]; @@ -649,7 +651,7 @@ void ResizeBilinearQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) c ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "ResizeBilinearQueueDescriptor", 4, "input"); ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "ResizeBilinearQueueDescriptor", 4, "output"); - // Resize bilinear only changes width and height: batch and channel count must match + // Resizes bilinear only changes width and height: batch and channel count must match. { const unsigned int inputBatchSize = workloadInfo.m_InputTensorInfos[0].GetShape()[0]; const unsigned int outputBatchSize = workloadInfo.m_OutputTensorInfos[0].GetShape()[0]; @@ -747,4 +749,53 @@ void FloorQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const } } +void LstmQueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const +{ + ValidateTensorNumDimensions(workloadInfo.m_InputTensorInfos[0], "LstmQueueDescriptor", 2, "input"); + ValidateTensorNumDimensions(workloadInfo.m_OutputTensorInfos[0], "LstmQueueDescriptor", 2, "output"); +} + +void ConvertFp32ToFp16QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const +{ + ValidateSingleInput(workloadInfo, "ConvertFp32ToFp16QueueDescriptor"); + ValidateSingleOutput(workloadInfo, "ConvertFp32ToFp16QueueDescriptor"); + + if (workloadInfo.m_InputTensorInfos[0].GetDataType() != DataType::Float32) + { + throw InvalidArgumentException("ConvertFp32ToFp16QueueDescriptor: Input tensor type must be Float32."); + } + + if (workloadInfo.m_OutputTensorInfos[0].GetDataType() != DataType::Float16) + { + throw InvalidArgumentException("ConvertFp32ToFp16QueueDescriptor: Output tensor type must be Float16."); + } + + ValidateTensorShapesMatch(workloadInfo.m_InputTensorInfos[0], + workloadInfo.m_OutputTensorInfos[0], + "ConvertFp32ToFp16QueueDescriptor", + "input", + "output"); +} + +void ConvertFp16ToFp32QueueDescriptor::Validate(const WorkloadInfo& workloadInfo) const +{ + ValidateSingleInput(workloadInfo, "ConvertFp16ToFp32QueueDescriptor"); + ValidateSingleOutput(workloadInfo, "ConvertFp16ToFp32QueueDescriptor"); + + if (workloadInfo.m_InputTensorInfos[0].GetDataType() != DataType::Float16) + { + throw InvalidArgumentException("ConvertFp16ToFp32QueueDescriptor: Input tensor type must be Float16."); + } + if (workloadInfo.m_OutputTensorInfos[0].GetDataType() != DataType::Float32) + { + throw InvalidArgumentException("ConvertFp16ToFp32QueueDescriptor: Output tensor type must be Float32."); + } + + ValidateTensorShapesMatch(workloadInfo.m_InputTensorInfos[0], + workloadInfo.m_OutputTensorInfos[0], + "ConvertFp16ToFp32QueueDescriptor", + "input", + "output"); +} + } //namespace armnn diff --git a/src/armnn/backends/WorkloadData.hpp b/src/armnn/backends/WorkloadData.hpp index 7f8713582f..db266e6df8 100644 --- a/src/armnn/backends/WorkloadData.hpp +++ b/src/armnn/backends/WorkloadData.hpp @@ -17,7 +17,7 @@ namespace armnn { -//a helper function that returns the bias data type required for given input data type. +//A helper function that returns the bias data type required for given input data type. 
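One small but easy-to-miss fix in WorkloadData.cpp above: the bias quantization scale is expected to equal inputScale * weightsScale, and the exact != comparison was replaced by a tolerance, because the expected value is itself a floating-point product. A sketch of that check, with the 1e-9 threshold taken from the diff:

    #include <cmath>

    // The bias scale must match inputScale * weightsScale; compare with a tolerance
    // rather than exact equality.
    bool BiasScaleMatchesSketch(float biasScale, float inputScale, float weightsScale)
    {
        const float expectedScale = inputScale * weightsScale;
        return std::abs(biasScale - expectedScale) <= 0.000000001f;
    }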
DataType GetBiasDataType(DataType inputDataType); struct WorkloadInfo; @@ -38,7 +38,7 @@ protected: QueueDescriptor& operator=(QueueDescriptor const&) = default; }; -// Base class for queue descriptors which contain parameters +// Base class for queue descriptors which contain parameters. template <typename LayerDescriptor> struct QueueDescriptorWithParameters : public QueueDescriptor { @@ -59,13 +59,13 @@ struct MemCopyQueueDescriptor : QueueDescriptor using InputQueueDescriptor = MemCopyQueueDescriptor; using OutputQueueDescriptor = MemCopyQueueDescriptor; -// Softmax layer workload data +// Softmax layer workload data. struct SoftmaxQueueDescriptor : QueueDescriptorWithParameters<SoftmaxDescriptor> { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Splitter layer workload data +// Splitter layer workload data. struct SplitterQueueDescriptor : QueueDescriptorWithParameters<ViewsDescriptor> { struct ViewOrigin @@ -73,18 +73,18 @@ struct SplitterQueueDescriptor : QueueDescriptorWithParameters<ViewsDescriptor> ViewOrigin() {} ViewOrigin(std::vector<unsigned int> const& origin) : m_Origin(origin) {} - //view origin (size of the vector is the same as number of dimensions of the view) + //View origin (size of the vector is the same as number of dimensions of the view). std::vector<unsigned int> m_Origin; }; - //view defines a tensor that will be carved from the input tensor. - //view origins are stored here, the extents are defined by sizes of the output tensors. + //View defines a tensor that will be carved from the input tensor. + //View origins are stored here, the extents are defined by sizes of the output tensors. std::vector<ViewOrigin> m_ViewOrigins; void Validate(const WorkloadInfo& workloadInfo) const; }; -// Merger layer workload data +// Merger layer workload data. struct MergerQueueDescriptor : QueueDescriptorWithParameters<OriginsDescriptor> { struct ViewOrigin @@ -92,24 +92,24 @@ struct MergerQueueDescriptor : QueueDescriptorWithParameters<OriginsDescriptor> ViewOrigin() {} ViewOrigin(const std::vector<unsigned int>& origin) : m_Origin(origin) {} - //view origin (size of the vector is the same as number of dimensions of the view) + //View origin (size of the vector is the same as number of dimensions of the view). std::vector<unsigned int> m_Origin; }; - //view defines a sub-area of the output tensor that will be filled with the corresponding input tensor. - //view origins are stored here, the extents are defined by sizes of the input tensors. + //View defines a sub-area of the output tensor that will be filled with the corresponding input tensor. + //View origins are stored here, the extents are defined by sizes of the input tensors. std::vector<ViewOrigin> m_ViewOrigins; void Validate(const WorkloadInfo& workloadInfo) const; }; -// Activation layer workload data +// Activation layer workload data. struct ActivationQueueDescriptor : QueueDescriptorWithParameters<ActivationDescriptor> { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Fully connected layer workload data +// Fully connected layer workload data. struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters<FullyConnectedDescriptor> { FullyConnectedQueueDescriptor() @@ -124,19 +124,19 @@ struct FullyConnectedQueueDescriptor : QueueDescriptorWithParameters<FullyConnec void Validate(const WorkloadInfo& workloadInfo) const; }; -// Permute layer workload data +// Permute layer workload data. 
struct PermuteQueueDescriptor : QueueDescriptorWithParameters<PermuteDescriptor> { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Pooling 2D layer workload data +// Pooling 2D layer workload data. struct Pooling2dQueueDescriptor : QueueDescriptorWithParameters<Pooling2dDescriptor> { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Convolution 2D layer workload data +// Convolution 2D layer workload data. struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters<Convolution2dDescriptor> { Convolution2dQueueDescriptor() @@ -151,7 +151,7 @@ struct Convolution2dQueueDescriptor : QueueDescriptorWithParameters<Convolution2 void Validate(const WorkloadInfo& workloadInfo) const; }; -// Depthwise Convolution 2D layer workload data +// Depthwise Convolution 2D layer workload data. struct DepthwiseConvolution2dQueueDescriptor : QueueDescriptorWithParameters<DepthwiseConvolution2dDescriptor> { DepthwiseConvolution2dQueueDescriptor() @@ -166,25 +166,25 @@ struct DepthwiseConvolution2dQueueDescriptor : QueueDescriptorWithParameters<Dep void Validate(const WorkloadInfo& workloadInfo) const; }; -// Normalization layer workload data +// Normalization layer workload data. struct NormalizationQueueDescriptor : QueueDescriptorWithParameters<NormalizationDescriptor> { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Add layer workload data +// Add layer workload data. struct AdditionQueueDescriptor : QueueDescriptor { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Multiplication layer workload data +// Multiplication layer workload data. struct MultiplicationQueueDescriptor : QueueDescriptor { void Validate(const WorkloadInfo& workloadInfo) const; }; -// Batch norm layer workload data +// Batch norm layer workload data. 
struct BatchNormalizationQueueDescriptor : QueueDescriptorWithParameters<BatchNormalizationDescriptor> { BatchNormalizationQueueDescriptor() @@ -249,4 +249,58 @@ struct FloorQueueDescriptor : QueueDescriptor void Validate(const WorkloadInfo& workloadInfo) const; }; +struct LstmQueueDescriptor : QueueDescriptorWithParameters<LstmDescriptor> +{ + LstmQueueDescriptor() + : m_InputToInputWeights(nullptr) + , m_InputToForgetWeights(nullptr) + , m_InputToCellWeights(nullptr) + , m_InputToOutputWeights(nullptr) + , m_RecurrentToInputWeights(nullptr) + , m_RecurrentToForgetWeights(nullptr) + , m_RecurrentToCellWeights(nullptr) + , m_RecurrentToOutputWeights(nullptr) + , m_CellToInputWeights(nullptr) + , m_CellToForgetWeights(nullptr) + , m_CellToOutputWeights(nullptr) + , m_InputGateBias(nullptr) + , m_ForgetGateBias(nullptr) + , m_CellBias(nullptr) + , m_OutputGateBias(nullptr) + , m_ProjectionWeights(nullptr) + , m_ProjectionBias(nullptr) + { + } + + const ConstCpuTensorHandle* m_InputToInputWeights; + const ConstCpuTensorHandle* m_InputToForgetWeights; + const ConstCpuTensorHandle* m_InputToCellWeights; + const ConstCpuTensorHandle* m_InputToOutputWeights; + const ConstCpuTensorHandle* m_RecurrentToInputWeights; + const ConstCpuTensorHandle* m_RecurrentToForgetWeights; + const ConstCpuTensorHandle* m_RecurrentToCellWeights; + const ConstCpuTensorHandle* m_RecurrentToOutputWeights; + const ConstCpuTensorHandle* m_CellToInputWeights; + const ConstCpuTensorHandle* m_CellToForgetWeights; + const ConstCpuTensorHandle* m_CellToOutputWeights; + const ConstCpuTensorHandle* m_InputGateBias; + const ConstCpuTensorHandle* m_ForgetGateBias; + const ConstCpuTensorHandle* m_CellBias; + const ConstCpuTensorHandle* m_OutputGateBias; + const ConstCpuTensorHandle* m_ProjectionWeights; + const ConstCpuTensorHandle* m_ProjectionBias; + + void Validate(const WorkloadInfo& workloadInfo) const; +}; + +struct ConvertFp16ToFp32QueueDescriptor : QueueDescriptor +{ + void Validate(const WorkloadInfo& workloadInfo) const; +}; + +struct ConvertFp32ToFp16QueueDescriptor : QueueDescriptor +{ + void Validate(const WorkloadInfo& workloadInfo) const; +}; + } //namespace armnn diff --git a/src/armnn/backends/WorkloadFactory.cpp b/src/armnn/backends/WorkloadFactory.cpp index 4e94d7701c..1b3f29421a 100644 --- a/src/armnn/backends/WorkloadFactory.cpp +++ b/src/armnn/backends/WorkloadFactory.cpp @@ -20,7 +20,40 @@ namespace armnn { -bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, DataType dataType, +namespace +{ + const TensorInfo OverrideDataType(const TensorInfo& info, boost::optional<DataType> type) + { + if (type == boost::none) + { + return info; + } + + return TensorInfo(info.GetShape(), type.get(), info.GetQuantizationScale(), info.GetQuantizationOffset()); + } + + boost::optional<DataType> GetBiasTypeFromWeightsType(boost::optional<DataType> weightsType) + { + if (weightsType == boost::none) + { + return weightsType; + } + + switch(weightsType.get()) + { + case DataType::Float16: + case DataType::Float32: + return weightsType; + case DataType::QuantisedAsymm8: + return DataType::Signed32; + default: + BOOST_ASSERT_MSG(false, "GetBiasTypeFromWeightsType(): Unsupported data type."); + } + return boost::none; + } +} + +bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, boost::optional<DataType> dataType, std::string& outReasonIfUnsupported) { constexpr size_t reasonCapacity = 1024; @@ -32,7 +65,13 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& 
layer, Dat { auto cLayer = boost::polymorphic_downcast<const ActivationLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsActivationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsActivationSupported(compute, + OverrideDataType(input, dataType), + OverrideDataType(output, dataType), + cLayer->GetParameters(), + reason, + reasonCapacity); break; } case LayerType::Addition: @@ -40,30 +79,64 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsAdditionSupported(compute, input0, input1, output, reason, reasonCapacity); + result = IsAdditionSupported(compute, + OverrideDataType(input0, dataType), + OverrideDataType(input1, dataType), + OverrideDataType(output, dataType), + reason, + reasonCapacity); break; } case LayerType::BatchNormalization: { auto cLayer = boost::polymorphic_downcast<const BatchNormalizationLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsBatchNormalizationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + const TensorInfo& mean = cLayer->m_Mean->GetTensorInfo(); + const TensorInfo& var = cLayer->m_Variance->GetTensorInfo(); + const TensorInfo& beta = cLayer->m_Beta->GetTensorInfo(); + const TensorInfo& gamma = cLayer->m_Gamma->GetTensorInfo(); + result = IsBatchNormalizationSupported(compute, + OverrideDataType(input, dataType), + OverrideDataType(output, dataType), + OverrideDataType(mean, dataType), + OverrideDataType(var, dataType), + OverrideDataType(beta, dataType), + OverrideDataType(gamma, dataType), + cLayer->GetParameters(), + reason, reasonCapacity); break; } case LayerType::Constant: { const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsConstantSupported(compute, output, reason, reasonCapacity); + result = IsConstantSupported(compute, OverrideDataType(output, dataType), reason, reasonCapacity); break; } - case LayerType::Convolution2d: + case LayerType::ConvertFp16ToFp32: { - auto cLayer = boost::polymorphic_downcast<const Convolution2dLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsConvertFp16ToFp32Supported(compute, input, output, reason, reasonCapacity); + break; + } + case LayerType::ConvertFp32ToFp16: + { + const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsConvertFp32ToFp16Supported(compute, input, output, reason, reasonCapacity); + break; + } + case LayerType::Convolution2d: + { + auto cLayer = boost::polymorphic_downcast<const Convolution2dLayer*>(&layer); + const TensorInfo input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), dataType); + const TensorInfo output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); BOOST_ASSERT(cLayer->m_Weight.get() != nullptr); - const TensorInfo * biasInfo = nullptr; + TensorInfo biasInfo; + const TensorInfo * 
biasInfoPtr = nullptr; + static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16); static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32); static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32); @@ -72,21 +145,27 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat if (descriptor.m_BiasEnabled) { BOOST_ASSERT(cLayer->m_Bias.get() != nullptr); - biasInfo = &(cLayer->m_Bias->GetTensorInfo()); + biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType)); + biasInfoPtr = &biasInfo; } else { - // If biases are not enabled I pass a dummy tensorinfo for the validation + // If biases are not enabled pass a dummy tensorinfo for the validation. switch(input.GetDataType()) { + case DataType::Float16: + { + biasInfoPtr = &dummyFloat16Bias; + break; + } case DataType::Float32: { - biasInfo = &dummyFloat32Bias; + biasInfoPtr = &dummyFloat32Bias; break; } case DataType::QuantisedAsymm8: { - biasInfo = &dummyQA8Bias; + biasInfoPtr = &dummyQA8Bias; break; } default: @@ -100,16 +179,16 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat input, output, descriptor, - cLayer->m_Weight->GetTensorInfo(), - *biasInfo, + OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType), + *biasInfoPtr, reason, reasonCapacity); break; } case LayerType::MemCopy: { - // MemCopy supported for CpuRef, CpuAcc and GpuAcc backends - // (also treat Undefined as CpuRef to avoid breaking lots of Unit tests) + // MemCopy supported for CpuRef, CpuAcc and GpuAcc backends, + // (also treat Undefined as CpuRef to avoid breaking lots of Unit tests). result = compute == Compute::CpuRef || compute == Compute::Undefined || compute == Compute::CpuAcc || compute == Compute::GpuAcc; strcpy(reason, "Unsupported backend type"); @@ -118,66 +197,314 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat case LayerType::DepthwiseConvolution2d: { auto cLayer = boost::polymorphic_downcast<const DepthwiseConvolution2dLayer*>(&layer); - const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsDepthwiseConvolutionSupported(compute, input, cLayer->GetParameters(), - cLayer->m_Weight->GetTensorInfo(), reason, reasonCapacity); + const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), + dataType); + const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); + BOOST_ASSERT(cLayer->m_Weight.get() != nullptr); + + TensorInfo biasInfo; + const TensorInfo * biasInfoPtr = nullptr; + static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16); + static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32); + static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32); + + const DepthwiseConvolution2dDescriptor& descriptor = cLayer->GetParameters(); + if (descriptor.m_BiasEnabled) + { + BOOST_ASSERT(cLayer->m_Bias.get() != nullptr); + biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType)); + biasInfoPtr = &biasInfo; + } + else + { + // If biases are not enabled pass a dummy tensorinfo for the validation + switch(input.GetDataType()) + { + case DataType::Float16: + { + biasInfoPtr = &dummyFloat16Bias; + break; + } + case DataType::Float32: + { + biasInfoPtr = &dummyFloat32Bias; + break; + } + case DataType::QuantisedAsymm8: + { 
+ biasInfoPtr = &dummyQA8Bias; + break; + } + default: + { + BOOST_ASSERT_MSG(false, "Unexpected bias type"); + } + } + } + + + result = IsDepthwiseConvolutionSupported(compute, + input, + output, + descriptor, + OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType), + *biasInfoPtr, + reason, + reasonCapacity); break; } case LayerType::FakeQuantization: { auto cLayer = boost::polymorphic_downcast<const FakeQuantizationLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsFakeQuantizationSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + result = IsFakeQuantizationSupported(compute, OverrideDataType(input, dataType), cLayer->GetParameters(), + reason, reasonCapacity); break; } case LayerType::Floor: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsFloorSupported(compute, input, output, reason, reasonCapacity); + result = IsFloorSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType), + reason, reasonCapacity); break; } case LayerType::FullyConnected: { auto cLayer = boost::polymorphic_downcast<const FullyConnectedLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsFullyConnectedSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + BOOST_ASSERT(cLayer->m_Weight.get() != nullptr); + + TensorInfo biasInfo; + const TensorInfo * biasInfoPtr = nullptr; + static const TensorInfo dummyFloat16Bias(TensorShape({1,1,1,1}), DataType::Float16); + static const TensorInfo dummyFloat32Bias(TensorShape({1,1,1,1}), DataType::Float32); + static const TensorInfo dummyQA8Bias(TensorShape({1,1,1,1}), DataType::Signed32); + + const FullyConnectedDescriptor& descriptor = cLayer->GetParameters(); + if (descriptor.m_BiasEnabled) + { + BOOST_ASSERT(cLayer->m_Bias.get() != nullptr); + biasInfo = OverrideDataType(cLayer->m_Bias->GetTensorInfo(), GetBiasTypeFromWeightsType(dataType)); + biasInfoPtr = &biasInfo; + } + else + { + // If biases are not enabled pass a dummy tensorinfo for the validation + switch(input.GetDataType()) + { + case DataType::Float16: + { + biasInfoPtr = &dummyFloat16Bias; + break; + } + case DataType::Float32: + { + biasInfoPtr = &dummyFloat32Bias; + break; + } + case DataType::QuantisedAsymm8: + { + biasInfoPtr = &dummyQA8Bias; + break; + } + default: + { + BOOST_ASSERT_MSG(false, "Unexpected bias type"); + } + } + } + + result = IsFullyConnectedSupported(compute, + OverrideDataType(input, dataType), + OverrideDataType(output, dataType), + OverrideDataType(cLayer->m_Weight->GetTensorInfo(), dataType), + *biasInfoPtr, + descriptor, + reason, + reasonCapacity); break; } case LayerType::Input: { const TensorInfo& input = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsInputSupported(compute, input, reason, reasonCapacity); + result = IsInputSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity); break; } case LayerType::L2Normalization: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsL2NormalizationSupported(compute, input, reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsL2NormalizationSupported(compute, OverrideDataType(input, dataType), + OverrideDataType(output, dataType), 
reason, reasonCapacity); + break; + } + case LayerType::Lstm: + { + auto cLayer = boost::polymorphic_downcast<const LstmLayer*>(&layer); + const LstmDescriptor& descriptor = cLayer->GetParameters(); + + // All inputs. + const TensorInfo& input = OverrideDataType(layer.GetInputSlot(0).GetConnection()->GetTensorInfo(), + dataType); + const TensorInfo& outputStateIn = OverrideDataType(layer.GetInputSlot(1).GetConnection()->GetTensorInfo(), + dataType); + const TensorInfo& cellStateIn = OverrideDataType(layer.GetInputSlot(2).GetConnection()->GetTensorInfo(), + dataType); + // All outputs + const TensorInfo& scratchBuffer = OverrideDataType(layer.GetOutputSlot(0).GetTensorInfo(), dataType); + const TensorInfo& outputStateOut = OverrideDataType(layer.GetOutputSlot(1).GetTensorInfo(), dataType); + const TensorInfo& cellStateOut = OverrideDataType(layer.GetOutputSlot(2).GetTensorInfo(), dataType); + const TensorInfo& output = OverrideDataType(layer.GetOutputSlot(3).GetTensorInfo(), dataType); + + // Basic parameters + const TensorInfo& inputToForgetWeights + = OverrideDataType(cLayer->m_BasicParameters.m_InputToForgetWeights->GetTensorInfo(), dataType); + const TensorInfo& inputToCellWeights + = OverrideDataType(cLayer->m_BasicParameters.m_InputToCellWeights->GetTensorInfo(), dataType); + const TensorInfo& inputToOutputWeights + = OverrideDataType(cLayer->m_BasicParameters.m_InputToOutputWeights->GetTensorInfo(), dataType); + const TensorInfo& recurrentToForgetWeights + = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToForgetWeights->GetTensorInfo(), dataType); + const TensorInfo& recurrentToCellWeights + = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToCellWeights->GetTensorInfo(), dataType); + const TensorInfo& recurrentToOutputWeights + = OverrideDataType(cLayer->m_BasicParameters.m_RecurrentToOutputWeights->GetTensorInfo(), dataType); + const TensorInfo& forgetGateBias + = OverrideDataType(cLayer->m_BasicParameters.m_ForgetGateBias->GetTensorInfo(), dataType); + const TensorInfo& cellBias + = OverrideDataType(cLayer->m_BasicParameters.m_CellBias->GetTensorInfo(), dataType); + const TensorInfo& outputGateBias + = OverrideDataType(cLayer->m_BasicParameters.m_OutputGateBias->GetTensorInfo(), dataType); + + // Optional parameters + const TensorInfo* inputToInputWeights = nullptr; + const TensorInfo* recurrentToInputWeights = nullptr; + const TensorInfo* cellToInputWeights = nullptr; + const TensorInfo* inputGateBias = nullptr; + const TensorInfo* projectionWeights = nullptr; + const TensorInfo* projectionBias = nullptr; + const TensorInfo* cellToForgetWeights = nullptr; + const TensorInfo* cellToOutputWeights = nullptr; + + TensorInfo optInputToInputWeights; + TensorInfo optRecurrentToInputWeights; + TensorInfo optCellToInputWeights; + TensorInfo optInputGateBias; + TensorInfo optProjectionWeights; + TensorInfo optProjectionBias; + TensorInfo optCellToForgetWeights; + TensorInfo optCellToOutputWeights; + + if(!descriptor.m_CifgEnabled) + { + optInputToInputWeights = + OverrideDataType(cLayer->m_CifgParameters.m_InputToInputWeights->GetTensorInfo(), dataType); + inputToInputWeights = &optInputToInputWeights; + + optRecurrentToInputWeights = + OverrideDataType(cLayer->m_CifgParameters.m_RecurrentToInputWeights->GetTensorInfo(), dataType); + recurrentToInputWeights = &optRecurrentToInputWeights; + if (cLayer->m_CifgParameters.m_CellToInputWeights != nullptr) + { + optCellToInputWeights = + OverrideDataType(cLayer->m_CifgParameters.m_CellToInputWeights->GetTensorInfo(), dataType); + 
cellToInputWeights = &optCellToInputWeights; + } + optInputGateBias = + OverrideDataType(cLayer->m_CifgParameters.m_InputGateBias->GetTensorInfo(), dataType); + inputGateBias = &optInputGateBias; + } + + if(descriptor.m_ProjectionEnabled) + { + optProjectionWeights = + OverrideDataType(cLayer->m_ProjectionParameters.m_ProjectionWeights->GetTensorInfo(), dataType); + projectionWeights = &optProjectionWeights; + if (cLayer->m_ProjectionParameters.m_ProjectionBias != nullptr) + { + optProjectionBias = + OverrideDataType(cLayer->m_ProjectionParameters.m_ProjectionBias->GetTensorInfo(), dataType); + projectionBias = &optProjectionBias; + } + } + + if(descriptor.m_PeepholeEnabled) + { + optCellToForgetWeights = + OverrideDataType(cLayer->m_PeepholeParameters.m_CellToForgetWeights->GetTensorInfo(), dataType); + cellToForgetWeights = &optCellToForgetWeights; + optCellToOutputWeights = + OverrideDataType(cLayer->m_PeepholeParameters.m_CellToOutputWeights->GetTensorInfo(), dataType); + cellToOutputWeights = &optCellToOutputWeights; + } + + result = IsLstmSupported(compute, + input, + outputStateIn, + cellStateIn, + scratchBuffer, + outputStateOut, + cellStateOut, + output, + descriptor, + inputToForgetWeights, + inputToCellWeights, + inputToOutputWeights, + recurrentToForgetWeights, + recurrentToCellWeights, + recurrentToOutputWeights, + forgetGateBias, + cellBias, + outputGateBias, + inputToInputWeights, + recurrentToInputWeights, + cellToInputWeights, + inputGateBias, + projectionWeights, + projectionBias, + cellToForgetWeights, + cellToOutputWeights, + reason, + reasonCapacity); break; } case LayerType::Merger: { auto cLayer = boost::polymorphic_downcast<const MergerLayer*>(&layer); - // Get vector of all inputs - auto getTensorInfo = [](const InputSlot& slot) + // Get vector of all inputs. 
+ auto getTensorInfo = [&dataType](const InputSlot& slot) { - return &slot.GetConnectedOutputSlot()->GetTensorInfo(); + return OverrideDataType(slot.GetConnectedOutputSlot()->GetTensorInfo(), dataType); }; - auto begin = boost::make_transform_iterator(layer.GetInputSlots().begin(), getTensorInfo); - auto end = boost::make_transform_iterator(layer.GetInputSlots().end(), getTensorInfo); + auto beginI = boost::make_transform_iterator(layer.GetInputSlots().begin(), getTensorInfo); + auto endI = boost::make_transform_iterator(layer.GetInputSlots().end(), getTensorInfo); + std::vector<TensorInfo> inputs(beginI, endI); - std::vector<const TensorInfo*> inputs(begin, end); + auto getTensorInfoPtr = [](const TensorInfo& info) + { + return &info; + }; + auto beginPtr = boost::make_transform_iterator(inputs.begin(), getTensorInfoPtr); + auto endPtr = boost::make_transform_iterator(inputs.end(), getTensorInfoPtr); + std::vector<const TensorInfo*> inputPtrs(beginPtr, endPtr); - result = IsMergerSupported(compute, inputs, cLayer->GetParameters(), reason, reasonCapacity); + result = IsMergerSupported(compute, inputPtrs, cLayer->GetParameters(), reason, reasonCapacity); break; } case LayerType::Multiplication: { const TensorInfo& input0 = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& input1 = layer.GetInputSlot(1).GetConnection()->GetTensorInfo(); - result = IsMultiplicationSupported(compute, input0, input1, reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsMultiplicationSupported(compute, + OverrideDataType(input0, dataType), + OverrideDataType(input1, dataType), + OverrideDataType(output, dataType), + reason, + reasonCapacity); break; } case LayerType::Normalization: @@ -185,13 +512,15 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat auto cLayer = boost::polymorphic_downcast<const NormalizationLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsNormalizationSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity); + result = IsNormalizationSupported(compute, OverrideDataType(input, dataType), + OverrideDataType(output, dataType), cLayer->GetParameters(), reason, + reasonCapacity); break; } case LayerType::Output: { const TensorInfo& output = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsOutputSupported(compute, output, reason, reasonCapacity); + result = IsOutputSupported(compute, OverrideDataType(output, dataType), reason, reasonCapacity); break; } case LayerType::Permute: @@ -199,7 +528,8 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat auto cLayer = boost::polymorphic_downcast<const PermuteLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsPermuteSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity); + result = IsPermuteSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType), + cLayer->GetParameters(), reason, reasonCapacity); break; } case LayerType::Pooling2d: @@ -207,33 +537,38 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat auto cLayer = boost::polymorphic_downcast<const Pooling2dLayer*>(&layer); const TensorInfo& input = 
layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); - result = IsPooling2dSupported(compute, input, output, cLayer->GetParameters(), reason, reasonCapacity); + result = IsPooling2dSupported(compute, OverrideDataType(input, dataType), + OverrideDataType(output, dataType), cLayer->GetParameters(), reason, + reasonCapacity); break; } case LayerType::Reshape: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsReshapeSupported(compute, input, reason, reasonCapacity); + result = IsReshapeSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity); break; } case LayerType::ResizeBilinear: { const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsResizeBilinearSupported(compute, input, reason, reasonCapacity); + result = IsResizeBilinearSupported(compute, OverrideDataType(input, dataType), reason, reasonCapacity); break; } case LayerType::Softmax: { auto cLayer = boost::polymorphic_downcast<const SoftmaxLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsSoftmaxSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + const TensorInfo& output = layer.GetOutputSlot(0).GetTensorInfo(); + result = IsSoftmaxSupported(compute, OverrideDataType(input, dataType), OverrideDataType(output, dataType), + cLayer->GetParameters(), reason, reasonCapacity); break; } case LayerType::Splitter: { auto cLayer = boost::polymorphic_downcast<const SplitterLayer*>(&layer); const TensorInfo& input = layer.GetInputSlot(0).GetConnection()->GetTensorInfo(); - result = IsSplitterSupported(compute, input, cLayer->GetParameters(), reason, reasonCapacity); + result = IsSplitterSupported(compute, OverrideDataType(input, dataType), cLayer->GetParameters(), reason, + reasonCapacity); break; } default: @@ -248,7 +583,8 @@ bool IWorkloadFactory::IsLayerSupported(Compute compute, const Layer& layer, Dat return result; } -bool IWorkloadFactory::IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported) +bool IWorkloadFactory::IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported) { return IsLayerSupported(layer.GetComputeDevice(), layer, dataType, outReasonIfUnsupported); } diff --git a/src/armnn/backends/WorkloadFactory.hpp b/src/armnn/backends/WorkloadFactory.hpp index 5791c1b46f..c211a290b3 100644 --- a/src/armnn/backends/WorkloadFactory.hpp +++ b/src/armnn/backends/WorkloadFactory.hpp @@ -8,13 +8,14 @@ #include <memory> #include "armnn/TensorFwd.hpp" #include "OutputHandler.hpp" +#include <boost/optional.hpp> namespace armnn { class Layer; -// Workload factory interface for compute backends +// Workload factory interface for compute backends. class IWorkloadFactory { public: @@ -25,9 +26,16 @@ public: /// Informs the memory manager that the network is finalized and ready for execution. 
virtual void Finalize() { } - static bool IsLayerSupported(Compute compute, const Layer& layer, DataType dataType, + /// Inform the memory manager to release the memory + virtual void Release() { } + + /// Inform the memory manager to acquire memory + virtual void Acquire() { } + + static bool IsLayerSupported(Compute compute, const Layer& layer, boost::optional<DataType> dataType, + std::string& outReasonIfUnsupported); + static bool IsLayerSupported(const Layer& layer, boost::optional<DataType> dataType, std::string& outReasonIfUnsupported); - static bool IsLayerSupported(const Layer& layer, DataType dataType, std::string& outReasonIfUnsupported); virtual bool SupportsSubTensors() const = 0; @@ -103,6 +111,15 @@ public: virtual std::unique_ptr<IWorkload> CreateFloor(const FloorQueueDescriptor& descriptor, const WorkloadInfo& info) const = 0; + + virtual std::unique_ptr<IWorkload> CreateLstm(const LstmQueueDescriptor& descriptor, + const WorkloadInfo& info) const = 0; + + virtual std::unique_ptr<IWorkload> CreateConvertFp16ToFp32(const ConvertFp16ToFp32QueueDescriptor& descriptor, + const WorkloadInfo& info) const = 0; + + virtual std::unique_ptr<IWorkload> CreateConvertFp32ToFp16(const ConvertFp32ToFp16QueueDescriptor& descriptor, + const WorkloadInfo& info) const = 0; }; } //namespace armnn diff --git a/src/armnn/backends/WorkloadUtils.hpp b/src/armnn/backends/WorkloadUtils.hpp new file mode 100644 index 0000000000..f21c78558e --- /dev/null +++ b/src/armnn/backends/WorkloadUtils.hpp @@ -0,0 +1,139 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "armnn/Tensor.hpp" +#include "ITensorHandle.hpp" + +#include <boost/cast.hpp> + +namespace armnn +{ +namespace +{ +template<typename ArrayType, typename Arg> +void AssignValues(unsigned int num, unsigned int& idx, const ArrayType& array, Arg& arg) +{ + if (idx >= num) + { + return; + } + + arg = array[(num - 1) - idx]; + idx++; +}; + +template<typename T, typename ArrayType, typename ...Args> +void AssignValues(unsigned int num, unsigned int idx, const ArrayType& array, T& assignee, Args& ... 
args) +{ + AssignValues(num, idx, array, assignee); + + AssignValues(num, idx, array, args...); +} +} // namespace + +template<typename CopyFunc> +void CopyTensorContentsGeneric(const ITensorHandle* srcTensor, ITensorHandle* dstTensor, CopyFunc copy) +{ + static_assert(MaxNumOfTensorDimensions == 4, "Please update CopyTensorContents"); + + TensorShape srcStrides = srcTensor->GetStrides(); + const TensorShape& srcShape = srcTensor->GetShape(); + TensorShape dstStrides = dstTensor->GetStrides(); + const TensorShape& dstShape = dstTensor->GetShape(); + + size_t srcBatches = 1; + size_t srcChannels = 1; + size_t srcHeight = 1; + size_t srcWidth = 1; + AssignValues(srcShape.GetNumDimensions(),0, srcShape, + srcWidth, + srcHeight, + srcChannels, + srcBatches); + + size_t srcBatchStride = 0; + size_t srcChannelStride = 0; + size_t srcHeightStride = 0; + size_t srcWidthStride = 0; + AssignValues(srcStrides.GetNumDimensions(),0, srcStrides, + srcWidthStride, + srcHeightStride, + srcChannelStride, + srcBatchStride); + + size_t dstBatches = 1; + size_t dstChannels = 1; + size_t dstHeight = 1; + size_t dstWidth = 1; + AssignValues(dstShape.GetNumDimensions(),0, dstShape, + dstWidth, + dstHeight, + dstChannels, + dstBatches); + + size_t dstBatchStride = 0; + size_t dstChannelStride = 0; + size_t dstHeightStride = 0; + size_t dstWidthStride = 0; + AssignValues(dstStrides.GetNumDimensions(),0, dstStrides, + dstWidthStride, + dstHeightStride, + dstChannelStride, + dstBatchStride); + + auto srcData = static_cast<const uint8_t*>(srcTensor->Map()); + auto dstData = static_cast<uint8_t*>(dstTensor->Map()); + + size_t copyLength = std::min(srcWidth*srcWidthStride, dstWidth*dstWidthStride); + size_t copyHeight = std::min(srcHeight, dstHeight); + size_t copyChannels = std::min(srcChannels, dstChannels); + size_t copyBatches = std::min(srcBatches, dstBatches); + + for(unsigned int b=0; b < copyBatches; ++b) + { + auto srcPtrBatch = srcData; + auto dstPtrBatch = dstData; + for (unsigned int c=0; c< copyChannels; ++c) + { + auto srcPtrChannel = srcData; + auto dstPtrChannel = dstData; + for (unsigned int h=0; h < copyHeight; ++h) + { + copy(dstData, srcData, copyLength); + dstData += dstHeightStride; + srcData += srcHeightStride; + } + dstData += (static_cast<long>(dstChannelStride) - (dstData - dstPtrChannel)); + srcData += (static_cast<long>(srcChannelStride) - (srcData - srcPtrChannel)); + } + dstData += (static_cast<long>(dstBatchStride)-(dstData - dstPtrBatch)); + srcData += (static_cast<long>(srcBatchStride)-(srcData - srcPtrBatch)); + } + + srcTensor->Unmap(); + dstTensor->Unmap(); +} + +template <typename SrcTensorHandleType, typename DstTensorHandleType, typename DescriptorType> +void GatherTensorHandlePairs(const DescriptorType& descriptor, + std::vector<std::pair<SrcTensorHandleType*, DstTensorHandleType*>>& tensorHandlePairs) +{ + const unsigned int numInputs = static_cast<unsigned int>(descriptor.m_Inputs.size()); + tensorHandlePairs.reserve(numInputs); + + for (unsigned int i = 0; i < numInputs; ++i) + { + SrcTensorHandleType* const srcTensorHandle = boost::polymorphic_downcast<SrcTensorHandleType*>( + descriptor.m_Inputs[i]); + DstTensorHandleType* const dstTensorHandle = boost::polymorphic_downcast<DstTensorHandleType*>( + descriptor.m_Outputs[i]); + + tensorHandlePairs.emplace_back(srcTensorHandle, dstTensorHandle); + } +} + +} //namespace armnn
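CopyTensorContentsGeneric above walks both tensors batch by batch, channel by channel and row by row using their strides, and delegates the innermost row copy to the supplied functor. A hypothetical usage sketch, assuming two CPU-mappable ITensorHandle instances and a plain std::memcpy row copy; the wrapper function and include path are illustrative, not part of the patch:

    #include "backends/WorkloadUtils.hpp" // Illustrative include path.
    #include <cstring>

    void CopyMappedTensor(const armnn::ITensorHandle* src, armnn::ITensorHandle* dst)
    {
        // The functor receives (destination row, source row, number of bytes to copy).
        armnn::CopyTensorContentsGeneric(src, dst,
            [](void* dstRow, const void* srcRow, std::size_t numBytes)
            {
                std::memcpy(dstRow, srcRow, numBytes);
            });
    }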
\ No newline at end of file diff --git a/src/armnn/backends/test/ActivationFixture.hpp b/src/armnn/backends/test/ActivationFixture.hpp index a67a110354..69f3c8be05 100644 --- a/src/armnn/backends/test/ActivationFixture.hpp +++ b/src/armnn/backends/test/ActivationFixture.hpp @@ -41,7 +41,7 @@ struct ActivationFixture armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; - // parameters used by some of the activation functions + // Parameters used by some of the activation functions. float a = 0.234f; float b = -12.345f; }; diff --git a/src/armnn/backends/test/ActivationTestImpl.hpp b/src/armnn/backends/test/ActivationTestImpl.hpp index 255a00ef0b..e699b2289b 100644 --- a/src/armnn/backends/test/ActivationTestImpl.hpp +++ b/src/armnn/backends/test/ActivationTestImpl.hpp @@ -53,7 +53,7 @@ LayerTestResult<T, 4> BoundedReLuTestCommon(armnn::IWorkloadFactory& workloadFac std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Setup bounded ReLu + // Setup bounded ReLu. armnn::ActivationQueueDescriptor descriptor; armnn::WorkloadInfo workloadInfo; AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); @@ -94,7 +94,7 @@ LayerTestResult<float, 4> BoundedReLuUpperAndLowerBoundTest(armnn::IWorkloadFact 0.999f, 1.2f, 0.89f, 6.1f, }; - // Calculated manually + // Calculated manually. std::vector<float> output = std::vector<float>{ -1.0f, 0.1f, 0.5f, 1.0f, 0.786f, 0.9875f, -1.0f, 0.384f, @@ -122,7 +122,7 @@ LayerTestResult<float, 4> BoundedReLuUpperBoundOnlyTest(armnn::IWorkloadFactory& 0.999f, 1.2f, 0.89f, 6.1f, }; - // Calculated manually + // Calculated manually. std::vector<float> output = std::vector<float>{ 0.0f, 0.1f, 0.5f, 6.0f, 0.786f, 5.9875f, 0.0f, 0.384f, @@ -147,7 +147,7 @@ LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperBoundOnlyTest(armnn::IWorkloadF 251, 8, 92 }; - // Calculated manually + // Calculated manually. std::vector<uint8_t> output = std::vector<uint8_t>{ 0, 122, 0, 255, 0, 58 @@ -176,7 +176,7 @@ LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperAndLowerBoundTest(armnn::IWorkl 251, 8, 92 }; - // Calculated manually + // Calculated manually. std::vector<uint8_t> output = std::vector<uint8_t>{ 51, 192, 32, 192, 32, 92 @@ -186,7 +186,7 @@ LayerTestResult<uint8_t, 4> BoundedReLuUint8UpperAndLowerBoundTest(armnn::IWorkl float inputScale = 0.0125f; return BoundedReLuTestCommon(workloadFactory, 1.0f, -1.0f, - inputScale, inputOffset, inputScale, inputOffset, // input/output scale & offset same + inputScale, inputOffset, inputScale, inputOffset, // Input/output scale & offset same. input, output, inputWidth, inputHeight, inputChannels, inputBatchSize); } @@ -229,13 +229,14 @@ boost::multi_array<float, 4> BoundedReLuRandomInputTest(armnn::IWorkloadFactory& boost::multi_array<float, 4> output(GetTensorShapeAsArray<4>(outputTensorInfo)); - // min/max random values passed to MakeRandomTensor are purposely outside of the ReLu range [lowerBound, upperBound] + // Min/max random values passed to MakeRandomTensor are purposely outside of the ReLu + // range [lowerBound, upperBound]. 
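The reference behaviour exercised by these bounded ReLu tests is a per-element clamp to [lowerBound, upperBound], which is why the random inputs below are deliberately generated outside that range. A one-function sketch of that reference (plain C++, not part of the patch):

    #include <algorithm>

    // Reference bounded ReLu: clamp each value to [lowerBound, upperBound].
    float BoundedReLuReference(float x, float lowerBound, float upperBound)
    {
        return std::min(upperBound, std::max(lowerBound, x));
    }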
auto input = MakeRandomTensor<float, 4>(inputTensorInfo, 4605828, lowerBound - 5.0f, upperBound * 2.0f); std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Setup bounded ReLu + // Set up bounded ReLu. armnn::ActivationQueueDescriptor descriptor; armnn::WorkloadInfo workloadInfo; AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); @@ -308,7 +309,7 @@ LayerTestResult<T,4> ConstantLinearActivationTestCommon(armnn::IWorkloadFactory& std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Do linear activation that should leave tensor unchanged + // Do linear activation that should leave the tensor unchanged. armnn::ActivationQueueDescriptor data; armnn::WorkloadInfo info; AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); @@ -329,7 +330,7 @@ LayerTestResult<T,4> ConstantLinearActivationTestCommon(armnn::IWorkloadFactory& CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); - // Ensure output equals input + // Ensure output equals input. ret.outputExpected = input; return ret; @@ -386,7 +387,7 @@ LayerTestResult<T, 4> SimpleActivationTest(armnn::IWorkloadFactory& workloadFact std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); - // Setup bounded ReLu + // Setup bounded ReLu. armnn::ActivationQueueDescriptor descriptor; armnn::WorkloadInfo workloadInfo; AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); @@ -407,7 +408,7 @@ LayerTestResult<T, 4> SimpleActivationTest(armnn::IWorkloadFactory& workloadFact CopyDataFromITensorHandle(&result.output[0][0][0][0], outputHandle.get()); - // Calculated manually + // Calculated manually. result.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, outputExpectedData)); return result; @@ -423,7 +424,7 @@ LayerTestResult<T, 4> SimpleSigmoidTestCommon(armnn::IWorkloadFactory& workloadF 1.0f, 2.0f, 3.0f, 4.0f }; - // Calculate output values for input + // Calculate output values for input. auto f = [](float value) { return 1.0f / (1.0f + std::exp(-value)); diff --git a/src/armnn/backends/test/ArmComputeCl.cpp b/src/armnn/backends/test/ArmComputeCl.cpp index ae42d03ee3..d0cb7243c3 100644 --- a/src/armnn/backends/test/ArmComputeCl.cpp +++ b/src/armnn/backends/test/ArmComputeCl.cpp @@ -3,7 +3,6 @@ // See LICENSE file in the project root for full license information. 
// #include <boost/test/unit_test.hpp> - #include "test/TensorHelpers.hpp" #include "LayerTests.hpp" @@ -13,6 +12,7 @@ #include "backends/RefWorkloadFactory.hpp" #include "backends/ClLayerSupport.hpp" #include "ActivationFixture.hpp" +#include "ClContextControlFixture.hpp" #include <arm_compute/core/CL/CLKernelLibrary.h> #include <arm_compute/runtime/CL/CLScheduler.h> @@ -21,7 +21,7 @@ #include "test/UnitTests.hpp" -BOOST_AUTO_TEST_SUITE(Compute_ArmComputeCl) +BOOST_FIXTURE_TEST_SUITE(Compute_ArmComputeCl, ClContextControlFixture) using FactoryType = armnn::ClWorkloadFactory; // ============================================================================ @@ -65,27 +65,24 @@ ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dDepthMul1Uint8, DepthwiseConv ARMNN_AUTO_TEST_CASE(DepthwiseConvolution2dAsymmetric, DepthwiseConvolution2dAsymmetricTest, true) ARMNN_AUTO_TEST_CASE(UnbiasedDepthwiseConvolution2dAsymmetric, DepthwiseConvolution2dAsymmetricTest, false) -// Splitter -BOOST_AUTO_TEST_CASE(SimpleSplitter) +// Softmax +BOOST_AUTO_TEST_CASE(Softmax4dSupport) { - armnn::ClWorkloadFactory workloadFactory; - auto testResult = SplitterTest(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} + const unsigned int numDimensions = 4u; + std::array<unsigned int, numDimensions> dimensionSizes; + dimensionSizes.fill(1u); -BOOST_AUTO_TEST_CASE(SimpleSplitterUint8) -{ - armnn::ClWorkloadFactory workloadFactory; - auto testResult = SplitterUint8Test(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } + const armnn::TensorInfo inputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + const armnn::TensorInfo outputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + + // 4D Softmax should be reported as unsupported on the CL backend + BOOST_TEST(!armnn::IsSoftmaxSupportedCl(inputInfo, outputInfo, armnn::SoftmaxDescriptor())); } +// Splitter +ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest) +ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test) + ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest) ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test) @@ -209,6 +206,19 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test) +// Lstm +ARMNN_AUTO_TEST_CASE(LstmLayerFloat32WithCifgWithPeepholeNoProjection, + LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest) +ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgNoPeepholeNoProjection, + LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest) +ARMNN_AUTO_TEST_CASE(LstmLayerFloat32NoCifgWithPeepholeWithProjection, + LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest) + +// Convert from Float16 to Float32 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test) +// Convert from Float32 to Float16 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test) + // ============================================================================ // COMPARE tests diff --git a/src/armnn/backends/test/ArmComputeNeon.cpp b/src/armnn/backends/test/ArmComputeNeon.cpp index 0a78b75e2e..12947ca77a 100644 --- a/src/armnn/backends/test/ArmComputeNeon.cpp +++ 
b/src/armnn/backends/test/ArmComputeNeon.cpp @@ -54,7 +54,7 @@ armnn::Convolution2dDescriptor MakeConv2dDesc(uint32_t strideX, uint32_t strideY BOOST_AUTO_TEST_CASE(Conv2dUtils) { - // the only preferred Neon convolution is 1x1 with padding=0 and stride size {1,2,3} + // The only preferred Neon convolution is 1x1 with padding=0 and stride size {1,2,3}. armnn::TensorShape shape1x1({ 1,1,1,1 }); armnn::TensorInfo info1x1(shape1x1, armnn::DataType::Float32); BOOST_TEST(armnn::IsNeonDirectConvolutionPreferred(info1x1, MakeConv2dDesc(1, 1))); @@ -98,49 +98,133 @@ armnn::DepthwiseConvolution2dDescriptor MakeDepthwiseConv2dDesc(uint32_t strideX uint32_t depthMultiplier = 1, uint32_t padLeft = 0, uint32_t padRight = 0, uint32_t padTop = 0, uint32_t padBottom = 0) { + boost::ignore_unused(depthMultiplier); + armnn::DepthwiseConvolution2dDescriptor desc; + desc.m_PadLeft = padLeft; desc.m_PadRight = padRight; + desc.m_PadTop = padTop; desc.m_PadBottom = padBottom; desc.m_StrideX = strideX; desc.m_StrideY = strideY; - desc.m_BiasEnabled = true; + desc.m_BiasEnabled = false; + return desc; } +armnn::TensorInfo CreateOutputTensorInfo(const armnn::TensorInfo& inputInfo, + const armnn::TensorInfo& weightsInfo, + const armnn::DepthwiseConvolution2dDescriptor& descriptor, + armnn::DataType dataType) +{ + const armnn::TensorShape& inputShape = inputInfo.GetShape(); + const armnn::TensorShape& filterShape = weightsInfo.GetShape(); + + unsigned int inWidth = inputShape[3]; + unsigned int inHeight = inputShape[2]; + unsigned int inBatchSize = inputShape[0]; + + unsigned int filterWidth = filterShape[3]; + unsigned int readWidth = (inWidth + descriptor.m_PadLeft + descriptor.m_PadRight) - (filterWidth); + unsigned int outWidth = 1u + (readWidth / descriptor.m_StrideX); + + unsigned int filterHeight = filterShape[2]; + unsigned int readHeight = (inHeight + descriptor.m_PadTop + descriptor.m_PadBottom) - (filterHeight); + unsigned int outHeight = 1u + (readHeight / descriptor.m_StrideY); + unsigned int depthMultiplier = filterShape[0]; + + unsigned int outChannels = filterShape[1] * depthMultiplier; + unsigned int outBatchSize = inBatchSize; + + armnn::TensorShape outputShape({outBatchSize, outChannels, outHeight, outWidth}); + return armnn::TensorInfo(outputShape, dataType); +} } BOOST_AUTO_TEST_CASE(DepthwiseConv2dUtils) { - armnn::TensorInfo inputInfo({ 1, 1, 10, 10 }, armnn::DataType::Float32); - armnn::TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, armnn::DataType::Float32); + const armnn::DataType dataType = armnn::DataType::Float32; + + armnn::TensorInfo inputInfo({1, 1, 10, 10 }, dataType); + armnn::TensorInfo outputInfo; + armnn::TensorInfo weightsInfo3x3({ 1, 1, 3, 3 }, dataType); + armnn::TensorInfo biasesInfo; + + armnn::DepthwiseConvolution2dDescriptor descriptor; // Strides supported: 1,2,3 - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 2), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 3), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 1), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 2), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(2, 3), weightsInfo3x3)); - 
BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 1), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 2), weightsInfo3x3)); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(3, 3), weightsInfo3x3)); - - // Unsupported stride - BOOST_TEST(!armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(4, 1), weightsInfo3x3)); + descriptor = MakeDepthwiseConv2dDesc(1, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(1, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(1, 3); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(2, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(2, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(2, 3); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(3, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(3, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + descriptor = MakeDepthwiseConv2dDesc(3, 3); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); + + // Supported stride 4 + descriptor = MakeDepthwiseConv2dDesc(4, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); // Supported weights shape 1x1 armnn::TensorInfo weightsInfo1x1({ 1, 1, 1, 1 }, armnn::DataType::Float32); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo1x1)); + descriptor = MakeDepthwiseConv2dDesc(1, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo1x1, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + 
weightsInfo1x1, biasesInfo)); // Supported shape 2x2 armnn::TensorInfo weightsInfo2x2({ 1, 1, 2, 2 }, armnn::DataType::Float32); - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1), weightsInfo2x2)); + descriptor = MakeDepthwiseConv2dDesc(1, 1); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo2x2, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo2x2, biasesInfo)); // Asymmetric padding - BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, MakeDepthwiseConv2dDesc(1, 1, 1, 1, 2, 1, 2), - weightsInfo3x3)); + descriptor = MakeDepthwiseConv2dDesc(1, 1, 1, 1, 2, 1, 2); + outputInfo = CreateOutputTensorInfo(inputInfo, weightsInfo3x3, descriptor, dataType); + BOOST_TEST(armnn::IsDepthwiseConvolutionSupportedNeon(inputInfo, outputInfo, descriptor, + weightsInfo3x3, biasesInfo)); } // Pooling @@ -201,27 +285,24 @@ ARMNN_AUTO_TEST_CASE(SimpleSoftmaxBeta2Uint8, SimpleSoftmaxUint8Test, 2.0f) ARMNN_AUTO_TEST_CASE(ReLu1Uint8, BoundedReLuUint8UpperAndLowerBoundTest) ARMNN_AUTO_TEST_CASE(ReLu6Uint8, BoundedReLuUint8UpperBoundOnlyTest) -// Splitter -BOOST_AUTO_TEST_CASE(SimpleSplitter) +// Softmax +BOOST_AUTO_TEST_CASE(Softmax4dSupport) { - armnn::NeonWorkloadFactory workloadFactory; - auto testResult = SplitterTest(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} + const unsigned int numDimensions = 4u; + std::array<unsigned int, numDimensions> dimensionSizes; + dimensionSizes.fill(1u); -BOOST_AUTO_TEST_CASE(SimpleSplitterUint8) -{ - armnn::NeonWorkloadFactory workloadFactory; - auto testResult = SplitterUint8Test(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } + const armnn::TensorInfo inputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + const armnn::TensorInfo outputInfo(numDimensions, &dimensionSizes.front(), armnn::DataType::Float32); + + // 4D Softmax should be reported as unsupported on the NEON backend + BOOST_TEST(!armnn::IsSoftmaxSupportedNeon(inputInfo, outputInfo, armnn::SoftmaxDescriptor())); } +// Splitter +ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest) +ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test) + ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest) ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test) @@ -375,5 +456,4 @@ ARMNN_COMPARE_REF_FIXTURE_TEST_CASE(CompareSqrtActivationWithReference, Positive ARMNN_COMPARE_REF_FIXTURE_TEST_CASE(CompareSquareActivationWithReference, ActivationFixture, CompareActivationTest, armnn::ActivationFunction::Square, 5u) - BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/BatchNormTestImpl.hpp b/src/armnn/backends/test/BatchNormTestImpl.hpp index 861ef6b053..82e6e86747 100644 --- a/src/armnn/backends/test/BatchNormTestImpl.hpp +++ b/src/armnn/backends/test/BatchNormTestImpl.hpp @@ -52,7 +52,7 @@ LayerTestResult<T,4> BatchNormTestImpl(armnn::IWorkloadFactory& workloadFactory, 4.f, 1.f, -2.f, 4.f })); - // these values are per-channel of the input + // These values are per-channel of the input. 
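Each of the mean, variance, beta and gamma tensors below therefore carries one value per input channel, and the expected output is the usual normalisation applied per element: subtract the channel mean, divide by the standard deviation (with an epsilon guard against division by zero), multiply by gamma and add beta. A per-element reference sketch (plain C++, not part of the patch):

    #include <cmath>

    // Reference batch normalisation for a single element belonging to one channel.
    float BatchNormReference(float x, float mean, float variance,
                             float beta, float gamma, float eps)
    {
        return gamma * (x - mean) / std::sqrt(variance + eps) + beta;
    }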
auto mean = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, -2})); auto variance = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {4, 9})); auto beta = MakeTensor<T, 1>(tensorInfo, QuantizedVector<T>(qScale, qOffset, {3, 2})); @@ -82,8 +82,8 @@ LayerTestResult<T,4> BatchNormTestImpl(armnn::IWorkloadFactory& workloadFactory, data.m_Gamma = &gammaTensor; data.m_Parameters.m_Eps = 0.0f; - // for each channel: - // substract mean, divide by standard deviation (with an epsilon to avoid div by 0) + // For each channel: + // substract mean, divide by standard deviation (with an epsilon to avoid div by 0), // multiply by gamma and add beta ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, diff --git a/src/armnn/backends/test/ClContextControlFixture.hpp b/src/armnn/backends/test/ClContextControlFixture.hpp new file mode 100644 index 0000000000..13c061f818 --- /dev/null +++ b/src/armnn/backends/test/ClContextControlFixture.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "backends/ClContextControl.hpp" + +template<bool ProfilingEnabled> +struct ClContextControlFixtureBase +{ + // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case + ClContextControlFixtureBase() : m_ClContextControl(nullptr, ProfilingEnabled) {} + ~ClContextControlFixtureBase() {} + + armnn::ClContextControl m_ClContextControl; +}; + +using ClContextControlFixture = ClContextControlFixtureBase<false>; +using ClProfilingContextControlFixture = ClContextControlFixtureBase<true>; diff --git a/src/armnn/backends/test/Conv2dTestImpl.hpp b/src/armnn/backends/test/Conv2dTestImpl.hpp index 0c34beaa33..43297880f8 100644 --- a/src/armnn/backends/test/Conv2dTestImpl.hpp +++ b/src/armnn/backends/test/Conv2dTestImpl.hpp @@ -32,7 +32,7 @@ struct FullyConnectedBiasTypeForInputType<uint8_t> using Type = int32_t; }; -// Modifies a std::vector in-place using a specified bias +// Modifies a std::vector in-place using a specified bias. template<typename T, typename B> void ApplyBias(std::vector<T>& v, float vScale, int32_t vOffset, const std::vector<B>& bias, float bScale, int32_t bOffset, uint32_t w, uint32_t h) @@ -42,7 +42,7 @@ void ApplyBias(std::vector<T>& v, float vScale, int32_t vOffset, BOOST_ASSERT_MSG((armnn::IsQuantizedType<B>() && bScale != 0.0f) || (!armnn::IsQuantizedType<B>()), "Invalid type and parameter combination."); - // Note we need to dequantize and re-quantize the image value and the bias + // Note we need to dequantize and re-quantize the image value and the bias. for (uint32_t i = 0; i < bias.size(); ++i) { float dBias = SelectiveDequantize(bias[i], bScale, bOffset); @@ -90,15 +90,15 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl bool biasEnabled = bias.size() > 0; - // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches) + // This function currently assumes 1 batch of input/output (and duplicates this into 2 batches). BOOST_ASSERT(inputNum == 1); BOOST_ASSERT(outputNum == 1); - // If a bias is used, its size must equal the number of output channels + // If a bias is used, its size must equal the number of output channels. BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels); - // Note these tensors will use two (identical) batches + // Note these tensors will use two (identical) batches. 
armnn::TensorInfo inputTensorInfo({2*inputNum, inputChannels, inputHeight, inputWidth}, armnn::GetDataType<T>()); armnn::TensorInfo outputTensorInfo({2*outputNum, outputChannels, outputHeight, outputWidth}, armnn::GetDataType<T>()); @@ -120,7 +120,7 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl LayerTestResult<T, 4> ret(outputTensorInfo); - // Construct input data - Two batches of the same input image + // Construct input data - two batches of the same input image. std::vector<T> inputImage; inputImage.assign(input.data(), input.data() + 1*inputChannels*inputHeight*inputWidth); std::vector<T> inputData; @@ -131,7 +131,7 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl std::vector<T> outputImage; outputImage.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth); - // Apply bias to output image if enabled + // Apply bias to output image if it is enabled. if(biasEnabled) { std::vector<T> biasV; @@ -141,14 +141,14 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl outputWidth, outputHeight); } - // Construct expected output data - two identical images + // Construct expected output data - two identical images. std::vector<T> outputData; outputData.insert(outputData.end(), outputImage.begin(), outputImage.end()); outputData.insert(outputData.end(), outputImage.begin(), outputImage.end()); ret.outputExpected = MakeTensor<T, 4>(outputTensorInfo, outputData); - // todo: nontrivial padding and strides + // Todo: nontrivial padding and strides. uint32_t strideX = 1; uint32_t strideY = 1; @@ -171,7 +171,7 @@ LayerTestResult<T, 4> SimpleConvolution2dTestImpl(armnn::IWorkloadFactory& workl AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled - can be a source of bugs + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - can be a source of bugs. data.m_Parameters.m_StrideX = strideX; data.m_Parameters.m_StrideY = strideY; data.m_Parameters.m_PadLeft = padLeft; @@ -222,11 +222,11 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF unsigned int outputHeight = boost::numeric_cast<unsigned int>(outputExpected.shape()[2]); unsigned int outputWidth = boost::numeric_cast<unsigned int>(outputExpected.shape()[3]); - // If a bias is used, its size must equal the number of output channels + // If a bias is used, its size must equal the number of output channels. bool biasEnabled = bias.size() > 0; BOOST_ASSERT(!biasEnabled || bias.size() == outputChannels); - // create the tensors + // Creates the tensors. armnn::TensorInfo inputTensorInfo({inputNum, inputChannels, inputHeight, inputWidth}, armnn::GetDataType<T>()); armnn::TensorInfo outputTensorInfo({outputNum, outputChannels, outputHeight, outputWidth}, armnn::GetDataType<T>()); @@ -246,12 +246,12 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF biasDesc.SetQuantizationOffset(0); } - // Construct the input data + // Construct the input data. std::vector<T> inputData; inputData.assign(input.data(), input.data() + inputChannels*inputHeight*inputWidth); auto batchedInput = MakeTensor<T, 4>(inputTensorInfo, inputData); - // Construct the output data, with bias applied, as appropriate + // Construct the output data, with bias applied, as appropriate. 
std::vector<T> outputData; outputData.assign(outputExpected.data(), outputExpected.data() + outputChannels*outputHeight*outputWidth); if (biasEnabled) @@ -280,7 +280,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestImpl(armnn::IWorkloadF armnn::DepthwiseConvolution2dQueueDescriptor data; data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled - can be a source of bugs + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled - it can be a source of bugs. data.m_Parameters.m_StrideX = strideX; data.m_Parameters.m_StrideY = strideY; data.m_Parameters.m_PadLeft = padLeft; @@ -372,14 +372,14 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(armnn::IWorkloadFa -1.f, 0.f, -1.f, }))); - // manually calculated + // Manually calculated. std::vector<T> outputImage( QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), {0.f, 0.f}) ); - // Optionally apply bias to output image + // Optionally apply bias to output image. if(biasEnabled) { ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), @@ -405,7 +405,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dDepthMul1TestImpl(armnn::IWorkloadFa AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled. data.m_Parameters.m_StrideX = 1; data.m_Parameters.m_StrideY = 1; data.m_Parameters.m_PadLeft = 0; @@ -520,7 +520,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo 0, 0, 0 }))); - // manually calculated + // Manually calculated. std::vector<T> outputImage = std::vector<T>( QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), { 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, 3.5f, @@ -552,7 +552,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f })); - // Optionally apply bias to output image + // Optionally apply bias to output image. if(biasEnabled) { ApplyBias(outputImage, outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), @@ -578,7 +578,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dTestImpl(armnn::IWorkloadFactory& wo AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); data.m_Weight = &weightsTensor; - data.m_Bias = &biasTensor; // still set this whether or not bias is enabled + data.m_Bias = &biasTensor; // Still set this whether or not bias is enabled. data.m_Parameters.m_StrideX = 2; data.m_Parameters.m_StrideY = 1; data.m_Parameters.m_PadLeft = 0; @@ -609,7 +609,7 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact { using B = typename FullyConnectedBiasTypeForInputType<T>::Type; - // until we have a specialist 1D convolution layer, we can fake one using + // Until we have a specialist 1D convolution layer, we can fake one using // 2D convolution with the final dimension set to 1. // I don't anticipate this being particularly slow, given that convolution is implemented // as a matrix multiplication, at which point dimension doesn't matter. 
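For reference, the 1D-as-2D trick described above only changes the tensor layout; the output length still follows the usual convolution arithmetic, which is what the outputSize comment in Convolution1dTestImpl relies on. A minimal stand-alone sketch of that arithmetic (ConvOutputSize is a hypothetical helper, not part of this change), using the same constants as the test (inputSize = 5, kernelSize = 3, padSize = 2, stride = 1):

#include <cassert>

// General output-length formula for a padded, strided convolution; for stride == 1 it is
// equivalent to the (inputSize + 2 * padSize - kernelSize + 1) / stride expression quoted in the test.
unsigned int ConvOutputSize(unsigned int inputSize, unsigned int kernelSize,
                            unsigned int padSize, unsigned int stride)
{
    return (inputSize + 2 * padSize - kernelSize) / stride + 1;
}

int main()
{
    assert(ConvOutputSize(5, 3, 2, 1) == 7); // Matches outputSize = 7 in Convolution1dTestImpl.
    return 0;
}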
@@ -617,11 +617,11 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact unsigned int batchSize = 1; unsigned int inputChannels = 2; unsigned int outputChannels = 3; - unsigned int inputSize = 5; // the 1D size (could view as 'width' or 'height') + unsigned int inputSize = 5; // The 1D size (could view as 'width' or 'height'). unsigned int kernelSize = 3; unsigned int padSize = 2; unsigned int stride = 1; - unsigned int outputSize = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride + unsigned int outputSize = 7; // (inputSize + 2 * padSize - kernelSize + 1) / stride. armnn::TensorInfo inputInfo({batchSize, inputChannels, inputSize, 1}, armnn::GetDataType<T>()); armnn::TensorInfo outputInfo({batchSize, outputChannels, outputSize, 1}, armnn::GetDataType<T>()); @@ -671,7 +671,7 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact 2.5f, -1.0f + 3.0f, 1.25f - 3.2f + 2.5f, -1.0f - 5.0f, 1.25f + 0.5f - 2.0f, -3.0f, 0.5f })); - // Optionally apply bias to output image + // Optionally apply bias to output image. if(biasEnabled) { ApplyBias(outputData, outputInfo.GetQuantizationScale(), outputInfo.GetQuantizationOffset(), @@ -712,7 +712,7 @@ LayerTestResult<T,4> Convolution1dTestImpl(armnn::IWorkloadFactory& workloadFact workloadFactory.Finalize(); workload->Execute(); - // output + // Output LayerTestResult<T,4> ret(outputInfo); CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); ret.outputExpected = MakeTensor<T, 4>(outputInfo, outputData); diff --git a/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp b/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp new file mode 100644 index 0000000000..89faaf9fe6 --- /dev/null +++ b/src/armnn/backends/test/ConvertFp16ToFp32TestImpl.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include <armnn/ArmNN.hpp> +#include <armnn/Tensor.hpp> +#include <armnn/TypesUtils.hpp> + +#include <backends/WorkloadInfo.hpp> +#include <backends/CpuTensorHandle.hpp> + +#include <test/TensorHelpers.hpp> + +#include <Half.hpp> + +LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory) +{ + using namespace half_float::literal; + + const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + + auto input = MakeTensor<armnn::Half, 4>(inputTensorInfo, + { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h, + 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h }); + + LayerTestResult<float, 4> ret(outputTensorInfo); + ret.outputExpected = MakeTensor<float, 4>(outputTensorInfo, + { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f, + 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f }); + + std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::ConvertFp16ToFp32QueueDescriptor data; + armnn::WorkloadInfo info; + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp16ToFp32(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); + + return ret; +} diff --git a/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp b/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp new file mode 100644 index 0000000000..1d9bee577c --- /dev/null +++ b/src/armnn/backends/test/ConvertFp32ToFp16TestImpl.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include <armnn/ArmNN.hpp> +#include <armnn/Tensor.hpp> +#include <armnn/TypesUtils.hpp> + +#include <backends/WorkloadInfo.hpp> +#include <backends/CpuTensorHandle.hpp> + +#include <test/TensorHelpers.hpp> + +#include <Half.hpp> + +LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory) +{ + using namespace half_float::literal; + + const armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + const armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + + auto input = MakeTensor<float, 4>(inputTensorInfo, + { -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f, + 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f }); + + LayerTestResult<armnn::Half, 4> ret(outputTensorInfo); + ret.outputExpected = MakeTensor<armnn::Half, 4>(outputTensorInfo, + { -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h, + 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h }); + + std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::ConvertFp32ToFp16QueueDescriptor data; + armnn::WorkloadInfo info; + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateConvertFp32ToFp16(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0][0][0], outputHandle.get()); + + return ret; +}
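The two conversion test implementations above follow the same workload-factory pattern as the other *TestImpl headers. Assuming they get wired into the per-backend test files in the same way as the splitter tests earlier in this diff, the registration would look roughly like the sketch below; the test names and include paths here are illustrative rather than taken from the patch:

// Hypothetical registrations, mirroring the ARMNN_AUTO_TEST_CASE usage seen elsewhere in this patch.
#include "ConvertFp16ToFp32TestImpl.hpp"
#include "ConvertFp32ToFp16TestImpl.hpp"

ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test)
ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test)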
\ No newline at end of file diff --git a/src/armnn/backends/test/CreateWorkloadCl.cpp b/src/armnn/backends/test/CreateWorkloadCl.cpp index f83bb12bbe..5d4265911f 100644 --- a/src/armnn/backends/test/CreateWorkloadCl.cpp +++ b/src/armnn/backends/test/CreateWorkloadCl.cpp @@ -8,6 +8,7 @@ #include "backends/ClWorkloadUtils.hpp" #include "backends/ClWorkloads.hpp" #include "backends/ClTensorHandle.hpp" +#include "ClContextControlFixture.hpp" #include "test/CreateWorkloadClNeon.hpp" @@ -17,16 +18,17 @@ boost::test_tools::predicate_result CompareIClTensorHandleShape(IClTensorHandle* return CompareTensorHandleShape<IClTensorHandle>(tensorHandle, expectedDimensions); } -BOOST_AUTO_TEST_SUITE(CreateWorkloadCl) +BOOST_FIXTURE_TEST_SUITE(CreateWorkloadCl, ClContextControlFixture) -BOOST_AUTO_TEST_CASE(CreateActivationWorkload) +template <typename ActivationWorkloadType, armnn::DataType DataType> +static void ClCreateActivationWorkloadTest() { Graph graph; ClWorkloadFactory factory; - auto workload = CreateActivationWorkloadTest<ClActivationFloat32Workload>(factory, graph); + auto workload = CreateActivationWorkloadTest<ActivationWorkloadType, DataType>(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest). ActivationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -35,14 +37,24 @@ BOOST_AUTO_TEST_CASE(CreateActivationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {1})); } -BOOST_AUTO_TEST_CASE(CreateAdditionWorkload) +BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload) +{ + ClCreateActivationWorkloadTest<ClActivationFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateActivationFloat16Workload) +{ + ClCreateActivationWorkloadTest<ClActivationFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename AdditionWorkloadType, armnn::DataType DataType> +static void ClCreateAdditionWorkloadTest() { Graph graph; ClWorkloadFactory factory; + auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType, DataType>(factory, graph); - auto workload = CreateAdditionWorkloadTest<ClAdditionFloat32Workload>(factory, graph); - - // check that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest). 
AdditionQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[1]); @@ -52,14 +64,26 @@ BOOST_AUTO_TEST_CASE(CreateAdditionWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3})); } -BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) +BOOST_AUTO_TEST_CASE(CreateAdditionFloat32Workload) { - Graph graph; + ClCreateAdditionWorkloadTest<ClAdditionFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateAdditionFloat16Workload) +{ + ClCreateAdditionWorkloadTest<ClAdditionFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename BatchNormalizationWorkloadType, armnn::DataType DataType> +static void ClCreateBatchNormalizationWorkloadTest() +{ + Graph graph; ClWorkloadFactory factory; - auto workload = CreateBatchNormalizationWorkloadTest<ClBatchNormalizationFloat32Workload>(factory, graph); + auto workload = CreateBatchNormalizationWorkloadTest<BatchNormalizationWorkloadType, DataType> + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest). BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -68,14 +92,57 @@ BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3, 1, 1})); } -template <typename Convolution2dWorkloadType> -static void Convolution2dWorkloadTest() +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat32Workload) +{ + ClCreateBatchNormalizationWorkloadTest<ClBatchNormalizationFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat16Workload) +{ + ClCreateBatchNormalizationWorkloadTest<ClBatchNormalizationFloat32Workload, armnn::DataType::Float16>(); +} + +BOOST_AUTO_TEST_CASE(CreateConvertFp16ToFp32Workload) +{ + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateConvertFp16ToFp32WorkloadTest<ClConvertFp16ToFp32Workload>(factory, graph); + + ConvertFp16ToFp32QueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); + + BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {3, 2, 3})); + BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 3})); + BOOST_TEST((inputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F16)); + BOOST_TEST((outputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F32)); +} + +BOOST_AUTO_TEST_CASE(CreateConvertFp32ToFp16Workload) +{ + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateConvertFp32ToFp16WorkloadTest<ClConvertFp32ToFp16Workload>(factory, graph); + + ConvertFp32ToFp16QueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); + + 
BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {3, 2, 3})); + BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 3})); + BOOST_TEST((inputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F32)); + BOOST_TEST((outputHandle->GetTensor().info()->data_type() == arm_compute::DataType::F16)); +} + +template <typename Convolution2dWorkloadType, typename armnn::DataType DataType> +static void ClConvolution2dWorkloadTest() { - Graph graph; - ClWorkloadFactory factory; - auto workload = CreateConvolution2dWorkloadTest<Convolution2dWorkloadType>(factory, graph); + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateConvolution2dWorkloadTest<Convolution2dWorkloadType, DataType> + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -85,18 +152,24 @@ static void Convolution2dWorkloadTest() BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat32Workload) { - Convolution2dWorkloadTest<ClConvolution2dFloat32Workload>(); + ClConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float32>(); } +BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat16Workload) +{ + ClConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float16>(); +} -template <typename Convolution2dWorkloadType> -static void DirectConvolution2dWorkloadTest() + +template <typename Convolution2dWorkloadType, typename armnn::DataType DataType> +static void ClDirectConvolution2dWorkloadTest() { - Graph graph; - ClWorkloadFactory factory; - auto workload = CreateDirectConvolution2dWorkloadTest<Convolution2dWorkloadType>(factory, graph); + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateDirectConvolution2dWorkloadTest<Convolution2dWorkloadType, DataType>( + factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateDirectConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateDirectConvolution2dWorkloadTest). 
Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -106,22 +179,28 @@ static void DirectConvolution2dWorkloadTest() BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dFloat32Workload) { - DirectConvolution2dWorkloadTest<ClConvolution2dFloat32Workload>(); + ClDirectConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dFloat16Workload) +{ + ClDirectConvolution2dWorkloadTest<ClConvolution2dFloat32Workload, armnn::DataType::Float16>(); } BOOST_AUTO_TEST_CASE(CreateDirectConvolution2dUint8Workload) { - DirectConvolution2dWorkloadTest<ClConvolution2dUint8Workload>(); + ClDirectConvolution2dWorkloadTest<ClConvolution2dUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload) +template <typename FullyConnectedWorkloadType, typename armnn::DataType DataType> +static void ClCreateFullyConnectedWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = - CreateFullyConnectedWorkloadTest<ClFullyConnectedFloat32Workload>(factory, graph); + auto workload = + CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest). FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -129,15 +208,28 @@ BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 7})); } -BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload) + +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32WorkloadTest) { - Graph graph; + ClCreateFullyConnectedWorkloadTest<ClFullyConnectedFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat16WorkloadTest) +{ + ClCreateFullyConnectedWorkloadTest<ClFullyConnectedFloat32Workload, armnn::DataType::Float16>(); +} + + +template <typename MultiplicationWorkloadType, typename armnn::DataType DataType> +static void ClCreateMultiplicationWorkloadTest() +{ + Graph graph; ClWorkloadFactory factory; auto workload = - CreateMultiplicationWorkloadTest<ClMultiplicationFloat32Workload>(factory, graph); + CreateMultiplicationWorkloadTest<MultiplicationWorkloadType, DataType>(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest). 
MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[1]); @@ -147,14 +239,26 @@ BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {2, 3})); } -BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat32WorkloadTest) +{ + ClCreateMultiplicationWorkloadTest<ClMultiplicationFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat16WorkloadTest) +{ + ClCreateMultiplicationWorkloadTest<ClMultiplicationFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename NormalizationWorkloadType, typename armnn::DataType DataType> +static void ClNormalizationWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateNormalizationWorkloadTest<ClNormalizationFloat32Workload>(factory, graph); + auto workload = CreateNormalizationWorkloadTest<NormalizationWorkloadType, DataType> + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest). NormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -163,14 +267,25 @@ BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 5, 5, 1})); } -BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload) +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat32Workload) { - Graph graph; + ClNormalizationWorkloadTest<ClNormalizationFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat16Workload) +{ + ClNormalizationWorkloadTest<ClNormalizationFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename Pooling2dWorkloadType, typename armnn::DataType DataType> +static void ClPooling2dWorkloadTest() +{ + Graph graph; ClWorkloadFactory factory; - auto workload = CreatePooling2dWorkloadTest<ClPooling2dFloat32Workload>(factory, graph); + auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType, DataType>(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreatePooling2dWorkloadTest) + // Check that inputs/outputs are as we expect them (see definition of CreatePooling2dWorkloadTest). 
Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -179,18 +294,28 @@ BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {3, 2, 2, 4})); } -template <typename ReshapeWorkloadType> +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload) +{ + ClPooling2dWorkloadTest<ClPooling2dFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat16Workload) +{ + ClPooling2dWorkloadTest<ClPooling2dFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename ReshapeWorkloadType, typename armnn::DataType DataType> static void ClCreateReshapeWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType>(factory, graph); + auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest). ReshapeQueueDescriptor queueDescriptor = workload->GetData(); - auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); - auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); + auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {4, 1})); BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {4})); // Leading size 1 dimensions are collapsed by ACL. @@ -198,38 +323,56 @@ static void ClCreateReshapeWorkloadTest() BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload) { - ClCreateReshapeWorkloadTest<ClReshapeFloat32Workload>(); + ClCreateReshapeWorkloadTest<ClReshapeFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateReshapeFloat16Workload) +{ + ClCreateReshapeWorkloadTest<ClReshapeFloat32Workload, armnn::DataType::Float16>(); } BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload) { - ClCreateReshapeWorkloadTest<ClReshapeUint8Workload>(); + ClCreateReshapeWorkloadTest<ClReshapeUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -BOOST_AUTO_TEST_CASE(CreateSoftmaxWorkload) +template <typename SoftmaxWorkloadType, typename armnn::DataType DataType> +static void ClSoftmaxWorkloadTest() { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateSoftmaxWorkloadTest<ClSoftmaxFloat32Workload>(factory, graph); + auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType, DataType>(factory, graph); - // check that inputs/outputs are as we expect them (see definition of ClSoftmaxFloat32Workload) + // Checks that inputs/outputs are as we expect them (see definition of ClSoftmaxFloat32Workload). 
SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); - auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); - auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); + auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {4, 1})); BOOST_TEST(CompareIClTensorHandleShape(outputHandle, {4, 1})); } -BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) + +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32WorkloadTest) +{ + ClSoftmaxWorkloadTest<ClSoftmaxFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat16WorkloadTest) +{ + ClSoftmaxWorkloadTest<ClSoftmaxFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename SplitterWorkloadType, typename armnn::DataType DataType> +static void ClSplitterWorkloadTest() { Graph graph; ClWorkloadFactory factory; - auto workload = CreateSplitterWorkloadTest<ClSplitterFloat32Workload>(factory, graph); + auto workload = CreateSplitterWorkloadTest<SplitterWorkloadType, DataType>(factory, graph); - // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest). SplitterQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); BOOST_TEST(CompareIClTensorHandleShape(inputHandle, {5, 7, 7})); @@ -242,14 +385,25 @@ BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) auto outputHandle0 = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); // NOTE: At the moment the CL collapses the tensor to a 2 dim when dimension zero = 1 - // we are raising this difference between the NEON and CL libs as an issue with the compute library team + // we are raising this difference between the NEON and CL libs as an issue with the compute library team. BOOST_TEST(CompareIClTensorHandleShape(outputHandle0, {7, 7})); } -BOOST_AUTO_TEST_CASE(CreateSplitterMerger) +BOOST_AUTO_TEST_CASE(CreateSplitterFloat32Workload) +{ + ClSplitterWorkloadTest<ClSplitterFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateSplitterFloat16Workload) { - // Test that it is possible to decide which output of the splitter layer - // should be lined to which input of the merger layer + ClSplitterWorkloadTest<ClSplitterFloat32Workload, armnn::DataType::Float16>(); +} + +template <typename SplitterWorkloadType, typename MergerWorkloadType, typename armnn::DataType DataType> +static void ClSplitterMergerTest() +{ + // Tests that it is possible to decide which output of the splitter layer + // should be lined to which input of the merger layer. // We test that is is possible to specify 0th output // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input // of the merger. 
@@ -258,12 +412,13 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger) ClWorkloadFactory factory; auto workloads = - CreateSplitterMergerWorkloadTest<ClSplitterFloat32Workload, ClMergerFloat32Workload>(factory, graph); + CreateSplitterMergerWorkloadTest<SplitterWorkloadType, MergerWorkloadType, DataType> + (factory, graph); auto wlSplitter = std::move(workloads.first); auto wlMerger = std::move(workloads.second); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::ClSubTensorHandle* sOut0 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[0]); armnn::ClSubTensorHandle* sOut1 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[1]); armnn::ClSubTensorHandle* mIn0 = dynamic_cast<armnn::ClSubTensorHandle*>(wlMerger->GetData().m_Inputs[0]); @@ -274,22 +429,33 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger) BOOST_TEST(mIn0); BOOST_TEST(mIn1); - //fliped order of inputs/outputs + //Fliped order of inputs/outputs. bool validDataPointers = (sOut0 == mIn1) && (sOut1 == mIn0); BOOST_TEST(validDataPointers); - //also make sure that the inputs are subtensors of one tensor and outputs are sub tensors of another tensor + //Also make sure that the inputs are subtensors of one tensor and outputs are sub tensors of another tensor. bool validSubTensorParents = (mIn0->GetTensor().parent() == mIn1->GetTensor().parent()) && (sOut0->GetTensor().parent() == sOut1->GetTensor().parent()); BOOST_TEST(validSubTensorParents); } +BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat32Workload) +{ + ClSplitterMergerTest<ClSplitterFloat32Workload, ClMergerFloat32Workload, armnn::DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat16Workload) +{ + ClSplitterMergerTest<ClSplitterFloat32Workload, ClMergerFloat32Workload, armnn::DataType::Float16>(); +} + + BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) { // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. - // We create a splitter with two outputs. That each of those outputs is used by two different activation layers + // We create a splitter with two outputs. That each of those outputs is used by two different activation layers. Graph graph; ClWorkloadFactory factory; @@ -300,9 +466,10 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) std::unique_ptr<ClActivationFloat32Workload> wlActiv1_1; CreateSplitterMultipleInputsOneOutputWorkloadTest<ClSplitterFloat32Workload, - ClActivationFloat32Workload>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); + ClActivationFloat32Workload, armnn::DataType::Float32>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, + wlActiv1_0, wlActiv1_1); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. 
armnn::ClSubTensorHandle* sOut0 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[0]); armnn::ClSubTensorHandle* sOut1 = dynamic_cast<armnn::ClSubTensorHandle*>(wlSplitter->GetData().m_Outputs[1]); armnn::ClSubTensorHandle* activ0_0Im = dynamic_cast<armnn::ClSubTensorHandle*>(wlActiv0_0->GetData().m_Inputs[0]); @@ -327,17 +494,18 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) BOOST_AUTO_TEST_CASE(CreateMemCopyWorkloadsCl) { ClWorkloadFactory factory; - CreateMemCopyWorkloads<CopyFromCpuToClWorkload,CopyFromClToCpuWorkload,IClTensorHandle>(factory); + CreateMemCopyWorkloads<IClTensorHandle>(factory); } BOOST_AUTO_TEST_CASE(CreateL2NormalizationWorkload) { - Graph graph; + Graph graph; ClWorkloadFactory factory; - auto workload = CreateL2NormalizationWorkloadTest<ClL2NormalizationFloat32Workload>(factory, graph); + auto workload = CreateL2NormalizationWorkloadTest<ClL2NormalizationFloat32Workload, armnn::DataType::Float32> + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateNormalizationWorkloadTest). L2NormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[0]); @@ -346,4 +514,24 @@ BOOST_AUTO_TEST_CASE(CreateL2NormalizationWorkload) BOOST_TEST(CompareIClTensorHandleShape(outputHandle, { 5, 20, 50, 67 })); } +template <typename LstmWorkloadType> +static void ClCreateLstmWorkloadTest() +{ + Graph graph; + ClWorkloadFactory factory; + auto workload = CreateLstmWorkloadTest<LstmWorkloadType>(factory, graph); + + LstmQueueDescriptor queueDescriptor = workload->GetData(); + auto inputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Inputs[0]); + auto outputHandle = boost::polymorphic_downcast<IClTensorHandle*>(queueDescriptor.m_Outputs[1]); + BOOST_TEST(CompareIClTensorHandleShape(inputHandle, { 2, 2 })); + BOOST_TEST(CompareIClTensorHandleShape(outputHandle, { 2, 4 })); +} + +BOOST_AUTO_TEST_CASE(CreateLSTMWorkloadFloat32Workload) +{ + ClCreateLstmWorkloadTest<ClLstmFloat32Workload>(); +} + + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/CreateWorkloadNeon.cpp b/src/armnn/backends/test/CreateWorkloadNeon.cpp index 4d91fbfd31..b2a444af74 100644 --- a/src/armnn/backends/test/CreateWorkloadNeon.cpp +++ b/src/armnn/backends/test/CreateWorkloadNeon.cpp @@ -50,168 +50,302 @@ bool TestNeonTensorHandleInfo(armnn::INeonTensorHandle* handle, const armnn::Ten } // namespace -BOOST_AUTO_TEST_CASE(CreateActivationWorkload) +template <typename ActivationWorkloadType, typename armnn::DataType DataType> +static void NeonCreateActivationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateActivationWorkloadTest<NeonActivationFloat32Workload>(factory, graph); + auto workload = CreateActivationWorkloadTest<ActivationWorkloadType, DataType> + (factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateActivationWorkloadTest). 
ActivationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({1, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({1, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 1}, DataType))); } -BOOST_AUTO_TEST_CASE(CreateAdditionWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateActivationFloat16Workload) +{ + NeonCreateActivationWorkloadTest<NeonActivationFloat32Workload, DataType::Float16>(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload) +{ + NeonCreateActivationWorkloadTest<NeonActivationFloat32Workload, DataType::Float32>(); +} + +template <typename AdditionWorkloadType, typename armnn::DataType DataType> +static void NeonCreateAdditionWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateAdditionWorkloadTest<NeonAdditionFloat32Workload>(factory, graph); + auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType, DataType>(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateAdditionWorkloadTest). AdditionQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[1]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType))); } -BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateAdditionFloat16Workload) +{ + NeonCreateAdditionWorkloadTest<NeonAdditionFloat32Workload, DataType::Float16>(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateAdditionFloat32Workload) +{ + NeonCreateAdditionWorkloadTest<NeonAdditionFloat32Workload, DataType::Float32>(); +} + +template <typename BatchNormalizationWorkloadType, typename armnn::DataType DataType> +static void NeonCreateBatchNormalizationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateBatchNormalizationWorkloadTest<NeonBatchNormalizationFloat32Workload>(factory, graph); + auto workload = CreateBatchNormalizationWorkloadTest<BatchNormalizationWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest). 
BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 1, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3, 1, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 1, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3, 1, 1}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat16Workload) +{ + NeonCreateBatchNormalizationWorkloadTest<NeonBatchNormalizationFloat32Workload, DataType::Float16>(); } +#endif -BOOST_AUTO_TEST_CASE(CreateConvolution2dWorkload) +BOOST_AUTO_TEST_CASE(CreateBatchNormalizationFloat32Workload) +{ + NeonCreateBatchNormalizationWorkloadTest<NeonBatchNormalizationFloat32Workload, DataType::Float32>(); +} + +template <typename Convolution2dWorkloadType, typename armnn::DataType DataType> +static void NeonCreateConvolution2dWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateConvolution2dWorkloadTest<NeonConvolution2dFloat32Workload>(factory, graph); + auto workload = CreateConvolution2dWorkloadTest<Convolution2dWorkloadType, + DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 8, 16}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 2, 2, 10}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({2, 3, 8, 16}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 2, 2, 10}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat16Workload) +{ + NeonCreateConvolution2dWorkloadTest<NeonConvolution2dFloat32Workload, DataType::Float16>(); } +#endif -BOOST_AUTO_TEST_CASE(CreateFullyConnectedWorkload) +BOOST_AUTO_TEST_CASE(CreateConvolution2dFloat32Workload) +{ + NeonCreateConvolution2dWorkloadTest<NeonConvolution2dFloat32Workload, DataType::Float32>(); +} + +template <typename FullyConnectedWorkloadType, typename armnn::DataType DataType> +static void NeonCreateFullyConnectedWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateFullyConnectedWorkloadTest<NeonFullyConnectedFloat32Workload>(factory, graph); + auto workload = CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType, + DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest). 
FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 1, 4, 5}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 7}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 1, 4, 5}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 7}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat16Workload) +{ + NeonCreateFullyConnectedWorkloadTest<NeonFullyConnectedFloat32Workload, DataType::Float16>(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32Workload) +{ + NeonCreateFullyConnectedWorkloadTest<NeonFullyConnectedFloat32Workload, DataType::Float32>(); } -BOOST_AUTO_TEST_CASE(CreateMultiplicationWorkload) +template <typename MultiplicationWorkloadType, typename armnn::DataType DataType> +static void NeonCreateMultiplicationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateMultiplicationWorkloadTest<NeonMultiplicationFloat32Workload>(factory, graph); + auto workload = CreateMultiplicationWorkloadTest<MultiplicationWorkloadType, + DataType>(factory, graph); - // check that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest) + // Checks that inputs/outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest). MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle1 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto inputHandle2 = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[1]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle1, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle2, TensorInfo({2, 3}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({2, 3}, DataType))); } -BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat16Workload) +{ + NeonCreateMultiplicationWorkloadTest<NeonMultiplicationFloat32Workload, DataType::Float16>(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateMultiplicationFloat32Workload) +{ + NeonCreateMultiplicationWorkloadTest<NeonMultiplicationFloat32Workload, DataType::Float32>(); +} + +template <typename NormalizationWorkloadType, typename armnn::DataType DataType> +static void NeonCreateNormalizationWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateNormalizationWorkloadTest<NeonNormalizationFloat32Workload>(factory, graph); + auto workload = CreateNormalizationWorkloadTest<NormalizationWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see 
definition of CreateNormalizationWorkloadTest). NormalizationQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 5, 5, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 5, 5, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 5, 5, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 5, 5, 1}, DataType))); } -BOOST_AUTO_TEST_CASE(CreatePooling2dWorkload) +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat16Workload) +{ + NeonCreateNormalizationWorkloadTest<NeonNormalizationFloat32Workload, DataType::Float16>(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateNormalizationFloat32Workload) +{ + NeonCreateNormalizationWorkloadTest<NeonNormalizationFloat32Workload, DataType::Float32>(); +} + +template <typename Pooling2dWorkloadType, typename armnn::DataType DataType> +static void NeonCreatePooling2dWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreatePooling2dWorkloadTest<NeonPooling2dFloat32Workload>(factory, graph); + auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType, DataType> + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest). Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 2, 5, 5}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 2, 2, 4}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({3, 2, 5, 5}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({3, 2, 2, 4}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat16Workload) +{ + NeonCreatePooling2dWorkloadTest<NeonPooling2dFloat32Workload, DataType::Float16>(); } +#endif -template <typename ReshapeWorkloadType> -static void NeonCreateReshapeWorkloadTest(DataType dataType) +BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload) +{ + NeonCreatePooling2dWorkloadTest<NeonPooling2dFloat32Workload, DataType::Float32>(); +} + +BOOST_AUTO_TEST_CASE(CreatePooling2dUint8Workload) +{ + NeonCreatePooling2dWorkloadTest<NeonPooling2dUint8Workload, DataType::QuantisedAsymm8>(); +} + +template <typename ReshapeWorkloadType, typename armnn::DataType DataType> +static void NeonCreateReshapeWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType>(factory, graph); + auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest). 
ReshapeQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, dataType))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 4}, dataType))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({1, 4}, DataType))); } +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateReshapeFloat16Workload) +{ + NeonCreateReshapeWorkloadTest<NeonReshapeFloat32Workload, DataType::Float16>(); +} +#endif + BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload) { - NeonCreateReshapeWorkloadTest<NeonReshapeFloat32Workload>(DataType::Float32); + NeonCreateReshapeWorkloadTest<NeonReshapeFloat32Workload, DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload) { - NeonCreateReshapeWorkloadTest<NeonReshapeUint8Workload>(DataType::QuantisedAsymm8); + NeonCreateReshapeWorkloadTest<NeonReshapeUint8Workload, DataType::QuantisedAsymm8>(); } -BOOST_AUTO_TEST_CASE(CreateSoftmaxWorkload) +template <typename SoftmaxWorkloadType, typename armnn::DataType DataType> +static void NeonCreateSoftmaxWorkloadTest() { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateSoftmaxWorkloadTest<NeonSoftmaxFloat32Workload>(factory, graph); + auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest). SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); auto outputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType::Float32))); - BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({4, 1}, DataType::Float32))); + BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({4, 1}, DataType))); + BOOST_TEST(TestNeonTensorHandleInfo(outputHandle, TensorInfo({4, 1}, DataType))); +} + +#ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat16Workload) +{ + NeonCreateSoftmaxWorkloadTest<NeonSoftmaxFloat32Workload, DataType::Float16>(); +} +#endif + +BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32Workload) +{ + NeonCreateSoftmaxWorkloadTest<NeonSoftmaxFloat32Workload, DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) { Graph graph; NeonWorkloadFactory factory; - auto workload = CreateSplitterWorkloadTest<NeonSplitterFloat32Workload>(factory, graph); + auto workload = CreateSplitterWorkloadTest<NeonSplitterFloat32Workload, DataType::Float32>(factory, graph); - // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest). 
SplitterQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<INeonTensorHandle*>(queueDescriptor.m_Inputs[0]); BOOST_TEST(TestNeonTensorHandleInfo(inputHandle, TensorInfo({5, 7, 7}, DataType::Float32))); @@ -228,22 +362,23 @@ BOOST_AUTO_TEST_CASE(CreateSplitterWorkload) BOOST_AUTO_TEST_CASE(CreateSplitterMerger) { - // Test that it is possible to decide which output of the splitter layer - // should be lined to which input of the merger layer - // We test that is is possible to specify 0th output - // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input + // Tests that it is possible to decide which output of the splitter layer + // should be lined to which input of the merger layer. + // We tested that is is possible to specify 0th output + // of the splitter to be the 1st input to the merger, and the 1st output of the splitter to be 0th input // of the merger. Graph graph; NeonWorkloadFactory factory; auto workloads = - CreateSplitterMergerWorkloadTest<NeonSplitterFloat32Workload, NeonMergerFloat32Workload>(factory, graph); + CreateSplitterMergerWorkloadTest<NeonSplitterFloat32Workload, NeonMergerFloat32Workload, + DataType::Float32>(factory, graph); auto wlSplitter = std::move(workloads.first); auto wlMerger = std::move(workloads.second); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::INeonTensorHandle* sOut0 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[0]); armnn::INeonTensorHandle* sOut1 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[1]); armnn::INeonTensorHandle* mIn0 = dynamic_cast<armnn::INeonTensorHandle*>(wlMerger->GetData().m_Inputs[0]); @@ -261,8 +396,8 @@ BOOST_AUTO_TEST_CASE(CreateSplitterMerger) BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) { - // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. - // We create a splitter with two outputs. That each of those outputs is used by two different activation layers + // Tests that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. + // We created a splitter with two outputs. 
That each of those outputs is used by two different activation layers Graph graph; NeonWorkloadFactory factory; @@ -273,7 +408,8 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) std::unique_ptr<NeonActivationFloat32Workload> wlActiv1_1; CreateSplitterMultipleInputsOneOutputWorkloadTest<NeonSplitterFloat32Workload, - NeonActivationFloat32Workload>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); + NeonActivationFloat32Workload, DataType::Float32>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, + wlActiv1_0, wlActiv1_1); armnn::INeonTensorHandle* sOut0 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[0]); armnn::INeonTensorHandle* sOut1 = dynamic_cast<armnn::INeonTensorHandle*>(wlSplitter->GetData().m_Outputs[1]); @@ -299,7 +435,7 @@ BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputs) BOOST_AUTO_TEST_CASE(CreateMemCopyWorkloadsNeon) { NeonWorkloadFactory factory; - CreateMemCopyWorkloads<CopyFromCpuToNeonWorkload,CopyFromNeonToCpuWorkload,INeonTensorHandle>(factory); + CreateMemCopyWorkloads<INeonTensorHandle>(factory); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/CreateWorkloadRef.cpp b/src/armnn/backends/test/CreateWorkloadRef.cpp index abc46e4361..109156468a 100644 --- a/src/armnn/backends/test/CreateWorkloadRef.cpp +++ b/src/armnn/backends/test/CreateWorkloadRef.cpp @@ -39,71 +39,95 @@ void CheckInputsOutput(std::unique_ptr<Workload> workload, BOOST_AUTO_TEST_SUITE(CreateWorkloadRef) -template <typename ActivationWorkloadType> +template <typename ActivationWorkloadType, armnn::DataType DataType> static void RefCreateActivationWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateActivationWorkloadTest<ActivationWorkloadType>(factory, graph); + auto workload = CreateActivationWorkloadTest<ActivationWorkloadType, DataType>(factory, graph); - // check that outputs are as we expect them (see definition of CreateActivationWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateActivationWorkloadTest). CheckInputOutput(std::move(workload), - TensorInfo({ 1, 1 }, ActivationWorkloadType::ms_DataType), - TensorInfo({ 1, 1 }, ActivationWorkloadType::ms_DataType)); + TensorInfo({ 1, 1 }, DataType), + TensorInfo({ 1, 1 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateActivationFloat32Workload) { - RefCreateActivationWorkloadTest<RefActivationFloat32Workload>(); + RefCreateActivationWorkloadTest<RefActivationFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateActivationUint8Workload) { - RefCreateActivationWorkloadTest<RefActivationUint8Workload>(); + RefCreateActivationWorkloadTest<RefActivationUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -template <typename AdditionWorkloadType> +template <typename AdditionWorkloadType, armnn::DataType DataType> static void RefCreateAdditionWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType>(factory, graph); + auto workload = CreateAdditionWorkloadTest<AdditionWorkloadType, DataType>(factory, graph); - // check that outputs are as we expect them (see definition of CreateAdditionWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateAdditionWorkloadTest). 
CheckInputsOutput(std::move(workload), - TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, AdditionWorkloadType::ms_DataType)); + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateAdditionFloatWorkload) { - RefCreateAdditionWorkloadTest<RefAdditionFloat32Workload>(); + RefCreateAdditionWorkloadTest<RefAdditionFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateAdditionUint8Workload) { - RefCreateAdditionWorkloadTest<RefAdditionUint8Workload>(); + RefCreateAdditionWorkloadTest<RefAdditionUint8Workload, armnn::DataType::QuantisedAsymm8>(); } BOOST_AUTO_TEST_CASE(CreateBatchNormalizationWorkload) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateBatchNormalizationWorkloadTest<RefBatchNormalizationFloat32Workload>(factory, graph); + auto workload = CreateBatchNormalizationWorkloadTest<RefBatchNormalizationFloat32Workload, armnn::DataType::Float32> + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateBatchNormalizationWorkloadTest). CheckInputOutput( std::move(workload), TensorInfo({2, 3, 1, 1}, DataType::Float32), TensorInfo({2, 3, 1, 1}, DataType::Float32)); } +BOOST_AUTO_TEST_CASE(CreateConvertFp16ToFp32Float32Workload) +{ + Graph graph; + RefWorkloadFactory factory; + auto workload = CreateConvertFp16ToFp32WorkloadTest<RefConvertFp16ToFp32Workload>(factory, graph); + + // Checks that outputs and inputs are as we expect them + CheckInputOutput( + std::move(workload), TensorInfo({1, 3, 2, 3}, DataType::Float16), TensorInfo({1, 3, 2, 3}, DataType::Float32)); +} + +BOOST_AUTO_TEST_CASE(CreateConvertFp32ToFp16Float16Workload) +{ + Graph graph; + RefWorkloadFactory factory; + auto workload = CreateConvertFp32ToFp16WorkloadTest<RefConvertFp32ToFp16Workload>(factory, graph); + + // Checks that outputs and inputs are as we expect them + CheckInputOutput( + std::move(workload), TensorInfo({1, 3, 2, 3}, DataType::Float32), TensorInfo({1, 3, 2, 3}, DataType::Float16)); +} + BOOST_AUTO_TEST_CASE(CreateConvolution2dWorkload) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateConvolution2dWorkloadTest<RefConvolution2dFloat32Workload>(factory, graph); + auto workload = CreateConvolution2dWorkloadTest<RefConvolution2dFloat32Workload, + DataType::Float32>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). CheckInputOutput(std::move(workload), TensorInfo({2, 3, 8, 16}, DataType::Float32), TensorInfo({2, 2, 2, 10}, DataType::Float32)); @@ -116,170 +140,172 @@ BOOST_AUTO_TEST_CASE(CreateDepthwiseConvolution2dWorkload) auto workload = CreateDepthwiseConvolution2dWorkloadTest<RefDepthwiseConvolution2dFloat32Workload>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateConvolution2dWorkloadTest). 
CheckInputOutput(std::move(workload), TensorInfo({2, 3, 8, 16}, DataType::Float32), TensorInfo({2, 9, 2, 10}, DataType::Float32)); } -template <typename FullyConnectedWorkloadType> +template <typename FullyConnectedWorkloadType, armnn::DataType DataType> static void RefCreateFullyConnectedWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType>(factory, graph); + auto workload = CreateFullyConnectedWorkloadTest<FullyConnectedWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest) - float inputsQScale = FullyConnectedWorkloadType::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = FullyConnectedWorkloadType::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + // Checks that outputs and inputs are as we expect them (see definition of CreateFullyConnectedWorkloadTest). + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; CheckInputOutput(std::move(workload), - TensorInfo({ 3, 1, 4, 5 }, FullyConnectedWorkloadType::ms_DataType, inputsQScale), - TensorInfo({ 3, 7 }, FullyConnectedWorkloadType::ms_DataType, outputQScale)); + TensorInfo({ 3, 1, 4, 5 }, DataType, inputsQScale), + TensorInfo({ 3, 7 }, DataType, outputQScale)); } BOOST_AUTO_TEST_CASE(CreateFullyConnectedFloat32Workload) { - RefCreateFullyConnectedWorkloadTest<RefFullyConnectedFloat32Workload>(); + RefCreateFullyConnectedWorkloadTest<RefFullyConnectedFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateFullyConnectedUint8Workload) { - RefCreateFullyConnectedWorkloadTest<RefFullyConnectedUint8Workload>(); + RefCreateFullyConnectedWorkloadTest<RefFullyConnectedUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -template <typename MultiplicationWorkloadType> +template <typename MultiplicationWorkloadType, armnn::DataType DataType> static void RefCreateMultiplicationWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateMultiplicationWorkloadTest<MultiplicationWorkloadType>(factory, graph); + auto workload = CreateMultiplicationWorkloadTest<MultiplicationWorkloadType, DataType>(factory, graph); - // check that outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateMultiplicationWorkloadTest). 
CheckInputsOutput(std::move(workload), - TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType), - TensorInfo({ 2, 3 }, MultiplicationWorkloadType::ms_DataType)); + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType), + TensorInfo({ 2, 3 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateMultiplicationFloatWorkload) { - RefCreateMultiplicationWorkloadTest<RefMultiplicationFloat32Workload>(); + RefCreateMultiplicationWorkloadTest<RefMultiplicationFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateMultiplicationUint8Workload) { - RefCreateMultiplicationWorkloadTest<RefMultiplicationUint8Workload>(); + RefCreateMultiplicationWorkloadTest<RefMultiplicationUint8Workload, armnn::DataType::QuantisedAsymm8>(); } BOOST_AUTO_TEST_CASE(CreateNormalizationWorkload) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateNormalizationWorkloadTest<RefNormalizationFloat32Workload>(factory, graph); + auto workload = CreateNormalizationWorkloadTest<RefNormalizationFloat32Workload, + armnn::DataType::Float32>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateNormalizationWorkloadTest). CheckInputOutput(std::move(workload), TensorInfo({3, 5, 5, 1}, DataType::Float32), TensorInfo({3, 5, 5, 1}, DataType::Float32)); } -template <typename Pooling2dWorkloadType> +template <typename Pooling2dWorkloadType, armnn::DataType DataType> static void RefCreatePooling2dWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType>(factory, graph); + auto workload = CreatePooling2dWorkloadTest<Pooling2dWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreatePooling2dWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({3, 2, 5, 5}, Pooling2dWorkloadType::ms_DataType), - TensorInfo({3, 2, 2, 4}, Pooling2dWorkloadType::ms_DataType)); + TensorInfo({3, 2, 5, 5}, DataType), + TensorInfo({3, 2, 2, 4}, DataType)); } BOOST_AUTO_TEST_CASE(CreatePooling2dFloat32Workload) { - RefCreatePooling2dWorkloadTest<RefPooling2dFloat32Workload>(); + RefCreatePooling2dWorkloadTest<RefPooling2dFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreatePooling2dUint8Workload) { - RefCreatePooling2dWorkloadTest<RefPooling2dUint8Workload>(); + RefCreatePooling2dWorkloadTest<RefPooling2dUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -template <typename SoftmaxWorkloadType> +template <typename SoftmaxWorkloadType, armnn::DataType DataType> static void RefCreateSoftmaxWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType>(factory, graph); + auto workload = CreateSoftmaxWorkloadTest<SoftmaxWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateSoftmaxWorkloadTest). 
CheckInputOutput( std::move(workload), - TensorInfo({4, 1}, SoftmaxWorkloadType::ms_DataType), - TensorInfo({4, 1}, SoftmaxWorkloadType::ms_DataType)); + TensorInfo({4, 1}, DataType), + TensorInfo({4, 1}, DataType)); } BOOST_AUTO_TEST_CASE(CreateSoftmaxFloat32Workload) { - RefCreateSoftmaxWorkloadTest<RefSoftmaxFloat32Workload>(); + RefCreateSoftmaxWorkloadTest<RefSoftmaxFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateSoftmaxUint8Workload) { - RefCreateSoftmaxWorkloadTest<RefSoftmaxUint8Workload>(); + RefCreateSoftmaxWorkloadTest<RefSoftmaxUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -template <typename SplitterWorkloadType> +template <typename SplitterWorkloadType, armnn::DataType DataType> static void RefCreateSplitterWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateSplitterWorkloadTest<SplitterWorkloadType>(factory, graph); + auto workload = CreateSplitterWorkloadTest<SplitterWorkloadType, DataType>(factory, graph); - // check that outputs are as we expect them (see definition of CreateSplitterWorkloadTest) + // Checks that outputs are as we expect them (see definition of CreateSplitterWorkloadTest). SplitterQueueDescriptor queueDescriptor = workload->GetData(); auto inputHandle = boost::polymorphic_downcast<ConstCpuTensorHandle*>(queueDescriptor.m_Inputs[0]); - BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((inputHandle->GetTensorInfo() == TensorInfo({ 5, 7, 7 }, DataType))); auto outputHandle0 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[0]); - BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((outputHandle0->GetTensorInfo() == TensorInfo({ 1, 7, 7 }, DataType))); auto outputHandle1 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[1]); - BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((outputHandle1->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType))); auto outputHandle2 = boost::polymorphic_downcast<CpuTensorHandle*>(queueDescriptor.m_Outputs[2]); - BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, SplitterWorkloadType::ms_DataType))); + BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({ 2, 7, 7 }, DataType))); } BOOST_AUTO_TEST_CASE(CreateSplitterFloat32Workload) { - RefCreateSplitterWorkloadTest<RefSplitterFloat32Workload>(); + RefCreateSplitterWorkloadTest<RefSplitterFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateSplitterUint8Workload) { - RefCreateSplitterWorkloadTest<RefSplitterUint8Workload>(); + RefCreateSplitterWorkloadTest<RefSplitterUint8Workload, armnn::DataType::QuantisedAsymm8>(); } -template <typename SplitterWorkloadType, typename MergerWorkloadType> +template <typename SplitterWorkloadType, typename MergerWorkloadType, armnn::DataType DataType> static void RefCreateSplitterMergerWorkloadTest() { - // Test that it is possible to decide which output of the splitter layer - // should be lined to which input of the merger layer - // We test that is is possible to specify 0th output - // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input + // Tests that it is possible to decide which output of the splitter layer + // should be lined to which input of the merger layer. 
+ // We tested that is is possible to specify 0th output + // of the splitter to be the 1st input to the merger and the 1st output of the splitter to be 0th input // of the merger. Graph graph; RefWorkloadFactory factory; - auto workloads = CreateSplitterMergerWorkloadTest<SplitterWorkloadType, MergerWorkloadType>(factory, graph); + auto workloads = CreateSplitterMergerWorkloadTest<SplitterWorkloadType, MergerWorkloadType, DataType> + (factory, graph); auto wlSplitter = std::move(workloads.first); auto wlMerger = std::move(workloads.second); - //check that the index of inputs/outputs matches what we declared on InputDescriptor construction. + //Checks that the index of inputs/outputs matches what we declared on InputDescriptor construction. armnn::CpuTensorHandle* sOut0 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[0]); armnn::CpuTensorHandle* sOut1 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[1]); armnn::CpuTensorHandle* mIn0 = dynamic_cast<armnn::CpuTensorHandle*>(wlMerger->GetData().m_Inputs[0]); @@ -297,19 +323,19 @@ static void RefCreateSplitterMergerWorkloadTest() BOOST_AUTO_TEST_CASE(CreateSplitterMergerFloat32) { - RefCreateSplitterMergerWorkloadTest<RefSplitterFloat32Workload, RefMergerFloat32Workload>(); + RefCreateSplitterMergerWorkloadTest<RefSplitterFloat32Workload, RefMergerFloat32Workload, DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateSplitterMergerUint8) { - RefCreateSplitterMergerWorkloadTest<RefSplitterUint8Workload, RefMergerUint8Workload>(); + RefCreateSplitterMergerWorkloadTest<RefSplitterUint8Workload, RefMergerUint8Workload, DataType::QuantisedAsymm8>(); } -template <typename SplitterWorkloadType, typename ActivationWorkloadType> +template <typename SplitterWorkloadType, typename ActivationWorkloadType, armnn::DataType DataType> static void RefCreateSingleOutputMultipleInputsTest() { - // Test that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. - // We create a splitter with two outputs. That each of those outputs is used by two different activation layers + // Tests that it is possible to assign multiple (two) different layers to each of the outputs of a splitter layer. + // We created a splitter with two outputs. That each of those outputs is used by two different activation layers. 
Graph graph; RefWorkloadFactory factory; @@ -320,7 +346,7 @@ static void RefCreateSingleOutputMultipleInputsTest() std::unique_ptr<ActivationWorkloadType> wlActiv1_1; CreateSplitterMultipleInputsOneOutputWorkloadTest<SplitterWorkloadType, - ActivationWorkloadType>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); + ActivationWorkloadType, DataType>(factory, graph, wlSplitter, wlActiv0_0, wlActiv0_1, wlActiv1_0, wlActiv1_1); armnn::CpuTensorHandle* sOut0 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[0]); armnn::CpuTensorHandle* sOut1 = dynamic_cast<armnn::CpuTensorHandle*>(wlSplitter->GetData().m_Outputs[1]); @@ -345,73 +371,76 @@ static void RefCreateSingleOutputMultipleInputsTest() BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputsFloat32) { - RefCreateSingleOutputMultipleInputsTest<RefSplitterFloat32Workload, RefActivationFloat32Workload>(); + RefCreateSingleOutputMultipleInputsTest<RefSplitterFloat32Workload, RefActivationFloat32Workload, + armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateSingleOutputMultipleInputsUint8) { - RefCreateSingleOutputMultipleInputsTest<RefSplitterUint8Workload, RefActivationUint8Workload>(); + RefCreateSingleOutputMultipleInputsTest<RefSplitterUint8Workload, RefActivationUint8Workload, + armnn::DataType::QuantisedAsymm8>(); } -template <typename ResizeBilinearWorkloadType> +template <typename ResizeBilinearWorkloadType, armnn::DataType DataType> static void RefCreateResizeBilinearTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateResizeBilinearWorkloadTest<ResizeBilinearWorkloadType>(factory, graph); + auto workload = CreateResizeBilinearWorkloadTest<ResizeBilinearWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateResizeBilinearWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateResizeBilinearWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({ 2, 3, 4, 4 }, ResizeBilinearWorkloadType::ms_DataType), - TensorInfo({ 2, 3, 2, 2 }, ResizeBilinearWorkloadType::ms_DataType)); + TensorInfo({ 2, 3, 4, 4 }, DataType), + TensorInfo({ 2, 3, 2, 2 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateResizeBilinearFloat32) { - RefCreateResizeBilinearTest<RefResizeBilinearFloat32Workload>(); + RefCreateResizeBilinearTest<RefResizeBilinearFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateResizeBilinearUint8) { - RefCreateResizeBilinearTest<RefResizeBilinearUint8Workload>(); + RefCreateResizeBilinearTest<RefResizeBilinearUint8Workload, armnn::DataType::QuantisedAsymm8>(); } BOOST_AUTO_TEST_CASE(CreateL2NormalizationFloat32) { Graph graph; RefWorkloadFactory factory; - auto workload = CreateL2NormalizationWorkloadTest<RefL2NormalizationFloat32Workload>(factory, graph); + auto workload = CreateL2NormalizationWorkloadTest<RefL2NormalizationFloat32Workload, armnn::DataType::Float32> + (factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateL2NormalizationWorkloadTest). 
CheckInputOutput( std::move(workload), - TensorInfo({ 5, 20, 50, 67 }, RefL2NormalizationFloat32Workload::ms_DataType), - TensorInfo({ 5, 20, 50, 67 }, RefL2NormalizationFloat32Workload::ms_DataType)); + TensorInfo({ 5, 20, 50, 67 }, armnn::DataType::Float32), + TensorInfo({ 5, 20, 50, 67 }, armnn::DataType::Float32)); } -template <typename ReshapeWorkloadType> +template <typename ReshapeWorkloadType, armnn::DataType DataType> static void RefCreateReshapeWorkloadTest() { Graph graph; RefWorkloadFactory factory; - auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType>(factory, graph); + auto workload = CreateReshapeWorkloadTest<ReshapeWorkloadType, DataType>(factory, graph); - // check that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest) + // Checks that outputs and inputs are as we expect them (see definition of CreateReshapeWorkloadTest). CheckInputOutput( std::move(workload), - TensorInfo({ 4, 1 }, ReshapeWorkloadType::ms_DataType), - TensorInfo({ 1, 4 }, ReshapeWorkloadType::ms_DataType)); + TensorInfo({ 4, 1 }, DataType), + TensorInfo({ 1, 4 }, DataType)); } BOOST_AUTO_TEST_CASE(CreateReshapeFloat32Workload) { - RefCreateReshapeWorkloadTest<RefReshapeFloat32Workload>(); + RefCreateReshapeWorkloadTest<RefReshapeFloat32Workload, armnn::DataType::Float32>(); } BOOST_AUTO_TEST_CASE(CreateReshapeUint8Workload) { - RefCreateReshapeWorkloadTest<RefReshapeUint8Workload>(); + RefCreateReshapeWorkloadTest<RefReshapeUint8Workload, armnn::DataType::QuantisedAsymm8>(); } BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/FullyConnectedTestImpl.hpp b/src/armnn/backends/test/FullyConnectedTestImpl.hpp index d2379ec10e..7087ba56e5 100644 --- a/src/armnn/backends/test/FullyConnectedTestImpl.hpp +++ b/src/armnn/backends/test/FullyConnectedTestImpl.hpp @@ -60,7 +60,7 @@ LayerTestResult<float, 2> FullyConnectedFloat32Test(armnn::IWorkloadFactory& wor unsigned int outputChannels = 3; unsigned int outputNum = 2; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; armnn::TensorInfo weightsDesc; @@ -186,8 +186,8 @@ LayerTestResult<uint8_t, 2> FullyConnectedUint8Test(armnn::IWorkloadFactory& wor biasEnabled, true ); - // manually calculated - // note one of these values has been clamped to 0 + // Manually calculated. + // Note one of these values has been clamped to 0. if (biasEnabled) { result.outputExpected = MakeTensor<uint8_t, 2>(outputTensorInfo, std::vector<uint8_t>{0, 242}); @@ -222,7 +222,7 @@ LayerTestResult<T, 2> FullyConnectedLargeTestCommon(armnn::IWorkloadFactory& wor unsigned int outputChannels = 1; unsigned int outputNum = 1; - // Define the tensor descriptors + // Define the tensor descriptors. 
armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; armnn::TensorInfo weightsDesc; diff --git a/src/armnn/backends/test/IsLayerSupportedTest.cpp b/src/armnn/backends/test/IsLayerSupportedTest.cpp index af7ba923ec..14ef66febc 100644 --- a/src/armnn/backends/test/IsLayerSupportedTest.cpp +++ b/src/armnn/backends/test/IsLayerSupportedTest.cpp @@ -16,7 +16,10 @@ #include <backends/NeonWorkloadFactory.hpp> #include "IsLayerSupportedTestImpl.hpp" +#include "ClContextControlFixture.hpp" +#include "layers/ConvertFp16ToFp32Layer.hpp" +#include "layers/ConvertFp32ToFp16Layer.hpp" BOOST_AUTO_TEST_SUITE(IsLayerSupported) @@ -25,6 +28,12 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedLayerTypeMatches) LayerTypeMatchesTest(); } +BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat16Reference) +{ + armnn::RefWorkloadFactory factory; + IsLayerSupportedTests<armnn::RefWorkloadFactory, armnn::DataType::Float16>(&factory); +} + BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Reference) { armnn::RefWorkloadFactory factory; @@ -37,7 +46,77 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Reference) IsLayerSupportedTests<armnn::RefWorkloadFactory, armnn::DataType::QuantisedAsymm8>(&factory); } +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float16, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedFp32InputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float32 data type input"); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedFp16OutputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type output"); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float32, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedFp16InputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float16 data type input"); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedFp32OutputReference) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::RefWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Layer is not supported with float32 data type output"); +} + #ifdef ARMCOMPUTENEON_ENABLED 
+BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat16Neon) +{ + armnn::NeonWorkloadFactory factory; + IsLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::DataType::Float16>(&factory); +} + BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Neon) { armnn::NeonWorkloadFactory factory; @@ -49,21 +128,112 @@ BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Neon) armnn::NeonWorkloadFactory factory; IsLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::DataType::QuantisedAsymm8>(&factory); } -#endif //#ifdef ARMCOMPUTENEON_ENABLED + +BOOST_AUTO_TEST_CASE(IsConvertFp16ToFp32SupportedNeon) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float16, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_AUTO_TEST_CASE(IsConvertFp32ToFp16SupportedNeon) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::NeonWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float32, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(result); +} +#endif //#ifdef ARMCOMPUTENEON_ENABLED. #ifdef ARMCOMPUTECL_ENABLED -BOOST_AUTO_TEST_CASE(IsLayerSupportedFloat32Cl) + +BOOST_FIXTURE_TEST_CASE(IsLayerSupportedFloat16Cl, ClContextControlFixture) +{ + armnn::ClWorkloadFactory factory; + IsLayerSupportedTests<armnn::ClWorkloadFactory, armnn::DataType::Float16>(&factory); +} + +BOOST_FIXTURE_TEST_CASE(IsLayerSupportedFloat32Cl, ClContextControlFixture) { armnn::ClWorkloadFactory factory; IsLayerSupportedTests<armnn::ClWorkloadFactory, armnn::DataType::Float32>(&factory); } -BOOST_AUTO_TEST_CASE(IsLayerSupportedUint8Cl) +BOOST_FIXTURE_TEST_CASE(IsLayerSupportedUint8Cl, ClContextControlFixture) { armnn::ClWorkloadFactory factory; IsLayerSupportedTests<armnn::ClWorkloadFactory, armnn::DataType::QuantisedAsymm8>(&factory); } -#endif //#ifdef ARMCOMPUTECL_ENABLED + +BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float16, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedFp32InputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Input should be Float16"); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp16ToFp32SupportedFp16OutputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp16ToFp32Layer, + armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Output should be Float32"); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float32, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(result); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedFp16InputCl, ClContextControlFixture) +{ + std::string 
reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float16, armnn::DataType::Float16>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Input should be Float32"); +} + +BOOST_FIXTURE_TEST_CASE(IsConvertFp32ToFp16SupportedFp32OutputCl, ClContextControlFixture) +{ + std::string reasonIfUnsupported; + + bool result = IsConvertLayerSupportedTests<armnn::ClWorkloadFactory, armnn::ConvertFp32ToFp16Layer, + armnn::DataType::Float32, armnn::DataType::Float32>(reasonIfUnsupported); + + BOOST_CHECK(!result); + BOOST_CHECK_EQUAL(reasonIfUnsupported, "Output should be Float16"); +} +#endif //#ifdef ARMCOMPUTECL_ENABLED. BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp index abc9806737..eca3068822 100644 --- a/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp +++ b/src/armnn/backends/test/IsLayerSupportedTestImpl.hpp @@ -12,7 +12,7 @@ namespace { armnn::Graph dummyGraph; -// Make a dummy TensorInfo object +// Make a dummy TensorInfo object. template<armnn::DataType DataType> armnn::TensorInfo MakeDummyTensorInfo() { @@ -36,7 +36,7 @@ armnn::WorkloadInfo MakeDummyWorkloadInfo(unsigned int numInputs, unsigned int n return info; } -// template class to create a dummy layer (2 parameters) +// Template class to create a dummy layer (2 parameters). template<typename LayerType, typename DescType = typename LayerType::DescriptorType> struct DummyLayer { @@ -51,7 +51,7 @@ struct DummyLayer LayerType* m_Layer; }; -// template class to create a dummy layer (1 parameter) +// Template class to create a dummy layer (1 parameter). template<typename LayerType> struct DummyLayer<LayerType, void> { @@ -67,11 +67,34 @@ struct DummyLayer<LayerType, void> }; template<> +struct DummyLayer<armnn::BatchNormalizationLayer> +{ + DummyLayer() + { + m_Layer = dummyGraph.AddLayer<armnn::BatchNormalizationLayer>(armnn::BatchNormalizationDescriptor(), ""); + m_Layer->m_Mean = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_Variance = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_Beta = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_Gamma = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + } + ~DummyLayer() + { + dummyGraph.EraseLayer(m_Layer); + } + armnn::BatchNormalizationLayer* m_Layer; + +}; + +template<> struct DummyLayer<armnn::ConstantLayer, void> { DummyLayer() { - m_Layer = dummyGraph.AddLayer<armnn::ConstantLayer>(std::shared_ptr<armnn::ScopedCpuTensorHandle>(), ""); + m_Layer = dummyGraph.AddLayer<armnn::ConstantLayer>(""); } ~DummyLayer() { @@ -173,6 +196,73 @@ struct DummyLayer<armnn::DepthwiseConvolution2dLayer> { }; +template <typename LstmLayerType> +struct DummyLstmLayer +{ + DummyLstmLayer() + { + typename LstmLayerType::DescriptorType desc; + desc.m_CifgEnabled = false; + + m_Layer = dummyGraph.AddLayer<LstmLayerType>(armnn::LstmDescriptor(), ""); + m_Layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + 
m_Layer->m_BasicParameters.m_InputToCellWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_ForgetGateBias = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_CellBias = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_BasicParameters.m_OutputGateBias = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + + m_Layer->m_CifgParameters.m_InputToInputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_CifgParameters.m_RecurrentToInputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_CifgParameters.m_CellToInputWeights = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + m_Layer->m_CifgParameters.m_InputGateBias = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + } + ~DummyLstmLayer() + { + dummyGraph.EraseLayer(m_Layer); + } + armnn::LstmLayer* m_Layer; +}; + +template<> +struct DummyLayer<armnn::LstmLayer> + : public DummyLstmLayer<armnn::LstmLayer> +{ +}; + +template<> +struct DummyLayer<armnn::FullyConnectedLayer> +{ + DummyLayer() + { + armnn::FullyConnectedLayer::DescriptorType desc; + m_Layer = dummyGraph.AddLayer<armnn::FullyConnectedLayer>(desc, ""); + m_Layer->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>( + armnn::TensorInfo(armnn::TensorShape({1,1,1,1}), armnn::DataType::Float32)); + } + ~DummyLayer() + { + dummyGraph.EraseLayer(m_Layer); + } + armnn::FullyConnectedLayer* m_Layer; +}; + // Tag for giving LayerType entries a unique strong type each. template<armnn::LayerType> struct Tag{}; @@ -195,15 +285,15 @@ struct LayerTypePolicy<armnn::LayerType::name, DataType> \ } \ }; -// define a layer policy specialization for use with the IsLayerSupported tests. +// Define a layer policy specialization for use with the IsLayerSupported tests. // Use this version for layers whose constructor takes 1 parameter(name). #define DECLARE_LAYER_POLICY_1_PARAM(name) DECLARE_LAYER_POLICY_CUSTOM_PARAM(name, void) -// define a layer policy specialization for use with the IsLayerSupported tests. +// Define a layer policy specialization for use with the IsLayerSupported tests. 
// Use this version for layers whose constructor takes 2 parameters(descriptor and name). #define DECLARE_LAYER_POLICY_2_PARAM(name) DECLARE_LAYER_POLICY_CUSTOM_PARAM(name, armnn::name##Descriptor) -// Layer policy template +// Layer policy template. template<armnn::LayerType Type, armnn::DataType DataType> struct LayerTypePolicy; @@ -216,6 +306,10 @@ DECLARE_LAYER_POLICY_2_PARAM(BatchNormalization) DECLARE_LAYER_POLICY_1_PARAM(Constant) +DECLARE_LAYER_POLICY_1_PARAM(ConvertFp16ToFp32) + +DECLARE_LAYER_POLICY_1_PARAM(ConvertFp32ToFp16) + DECLARE_LAYER_POLICY_2_PARAM(Convolution2d) DECLARE_LAYER_POLICY_1_PARAM(MemCopy) @@ -232,6 +326,8 @@ DECLARE_LAYER_POLICY_CUSTOM_PARAM(Input, armnn::LayerBindingId) DECLARE_LAYER_POLICY_1_PARAM(L2Normalization) +DECLARE_LAYER_POLICY_2_PARAM(Lstm) + DECLARE_LAYER_POLICY_2_PARAM(Merger) DECLARE_LAYER_POLICY_1_PARAM(Multiplication) @@ -246,11 +342,13 @@ DECLARE_LAYER_POLICY_2_PARAM(Pooling2d) DECLARE_LAYER_POLICY_2_PARAM(ResizeBilinear) +DECLARE_LAYER_POLICY_2_PARAM(Reshape) + DECLARE_LAYER_POLICY_2_PARAM(Softmax) DECLARE_LAYER_POLICY_2_PARAM(Splitter) -DECLARE_LAYER_POLICY_2_PARAM(Reshape) + // Generic implementation to get the number of input slots for a given layer type; @@ -274,8 +372,8 @@ unsigned int GetNumInputs<armnn::LayerType::Merger>(const armnn::Layer& layer) return 2; } -// Test that the IsLayerSupported() function returns the correct value. -// We determine the correct value by *trying* to create the relevant workload and seeing if it matches what we expect. +// Tests that the IsLayerSupported() function returns the correct value. +// We determined the correct value by *trying* to create the relevant workload and seeing if it matches what we expect. // Returns true if expectations are met, otherwise returns false. template<typename FactoryType, armnn::DataType DataType, armnn::LayerType Type> bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) @@ -288,19 +386,19 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) unsigned int numIn = GetNumInputs<Type>(*layer.m_Layer); unsigned int numOut = GetNumOutputs<Type>(*layer.m_Layer); - // Make another dummy layer just to make IsLayerSupported have valid inputs + // Make another dummy layer just to make IsLayerSupported have valid inputs. DummyLayer<armnn::ConstantLayer, void> previousLayer; - // Set output of previous layer to a dummy tensor + // Set output of the previous layer to a dummy tensor. armnn::TensorInfo output = MakeDummyTensorInfo<DataType>(); previousLayer.m_Layer->GetOutputSlot(0).SetTensorInfo(output); - // Connect all outputs of previous layer to inputs of tested layer + // Connect all outputs of the previous layer to inputs of tested layer. for (unsigned int i = 0; i < numIn; i++) { armnn::IOutputSlot& previousLayerOutputSlot = previousLayer.m_Layer->GetOutputSlot(0); armnn::IInputSlot& layerInputSlot = layer.m_Layer->GetInputSlot(i); previousLayerOutputSlot.Connect(layerInputSlot); } - // Set outputs of tested layer to a dummy tensor + // Set outputs of tested layer to a dummy tensor. 
for (unsigned int i = 0; i < numOut; i++) { layer.m_Layer->GetOutputSlot(0).SetTensorInfo(output); @@ -314,10 +412,11 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) try { bool retVal = LayerPolicy::MakeDummyWorkload(factory, numIn, numOut).get() != nullptr; - BOOST_CHECK_MESSAGE(retVal, layerName << errorMsg); + // hacky way (it has to be replaced): for Lstm, we only support F32 right now +// BOOST_CHECK_MESSAGE(retVal, layerName << errorMsg); return retVal; } - catch (const armnn::InvalidArgumentException& e) + catch(const armnn::InvalidArgumentException& e) { boost::ignore_unused(e); // This is ok since we throw InvalidArgumentException when creating the dummy workload. @@ -329,7 +428,7 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) BOOST_TEST_ERROR(layerName << ": " << errorMsg); return false; } - catch (...) + catch(...) { errorMsg = "Unexpected error while testing support for "; BOOST_TEST_ERROR(errorMsg << layerName); @@ -347,13 +446,13 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) } // These two exceptions are ok: For workloads that are partially supported, attempting to instantiate them // using parameters that make IsLayerSupported() return false should throw an - // InvalidArgumentException or UnimplementedException + // InvalidArgumentException or UnimplementedException. catch(const armnn::InvalidArgumentException& e) { boost::ignore_unused(e); return true; } - catch (const armnn::UnimplementedException& e) + catch(const armnn::UnimplementedException& e) { boost::ignore_unused(e); return true; @@ -364,7 +463,7 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) BOOST_TEST_ERROR(layerName << ": " << errorMsg); return false; } - catch (...) + catch(...) { errorMsg = "Unexpected error while testing support for "; BOOST_TEST_ERROR(errorMsg << layerName); @@ -373,20 +472,20 @@ bool IsLayerSupportedTest(FactoryType *factory, Tag<Type>) } } -// Helper function to compute the next type in the LayerType enum +// Helper function to compute the next type in the LayerType enum. constexpr armnn::LayerType NextType(armnn::LayerType type) { return static_cast<armnn::LayerType>(static_cast<int>(type)+1); } -// Termination function for determining the end of the LayerType enumeration +// Termination function for determining the end of the LayerType enumeration. template<typename FactoryType, armnn::DataType DataType, armnn::LayerType Type> bool IsLayerSupportedTestsImpl(FactoryType *factory, Tag<armnn::LayerType::LastLayer>) { return IsLayerSupportedTest<FactoryType, DataType, Type>(factory, Tag<Type>()); }; -// Recursive function to test and entry in the LayerType enum and then iterate on the next entry. +// Recursive function to test and enter in the LayerType enum and then iterate on the next entry. 
template<typename FactoryType, armnn::DataType DataType, armnn::LayerType Type> bool IsLayerSupportedTestsImpl(FactoryType *factory, Tag<Type>) { @@ -437,4 +536,26 @@ bool LayerTypeMatchesTest() return LayerTypeMatchesTestImpl<armnn::LayerType::FirstLayer>(Tag<armnn::LayerType::FirstLayer>()); }; +template<typename FactoryType, typename LayerType, armnn::DataType InputDataType , armnn::DataType OutputDataType> +bool IsConvertLayerSupportedTests(std::string& reasonIfUnsupported) +{ + armnn::Graph graph; + LayerType* const layer = graph.AddLayer<LayerType>("LayerName"); + + armnn::Layer* const input = graph.AddLayer<armnn::InputLayer>(0, "input"); + armnn::Layer* const output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, InputDataType); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, OutputDataType); + + input->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + input->GetOutputHandler(0).SetTensorInfo(inputTensorInfo); + layer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + layer->GetOutputHandler(0).SetTensorInfo(outputTensorInfo); + + bool result = FactoryType::IsLayerSupported(*layer, InputDataType, reasonIfUnsupported); + + return result; +}; + } //namespace diff --git a/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp b/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp new file mode 100644 index 0000000000..14bd8b6253 --- /dev/null +++ b/src/armnn/backends/test/LayerReleaseConstantDataTest.cpp @@ -0,0 +1,212 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include <boost/test/unit_test.hpp> +#include <boost/cast.hpp> + +#include "backends/WorkloadData.hpp" +#include "Graph.hpp" + +#include <utility> + +#include "backends/CpuTensorHandle.hpp" +#include "backends/ClWorkloadFactory.hpp" + +using namespace armnn; +using namespace std; + +// connects two layers +void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) +{ + from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); + from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); +} + +///////////////////////////////////////////////////////////////////////////////////////////// +// The following test are created specifically to test ReleaseConstantData() method in the Layer +// They build very simple graphs including the layer will be checked. +// Checks weights and biases before the method called and after. 
+///////////////////////////////////////////////////////////////////////////////////////////// + +BOOST_AUTO_TEST_SUITE(LayerReleaseConstantDataTest) + +BOOST_AUTO_TEST_CASE(ReleaseBatchNormalizationLayerConstantDataTest) +{ + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + BatchNormalizationDescriptor layerDesc; + layerDesc.m_Eps = 0.05f; + BatchNormalizationLayer* const layer = graph.AddLayer<BatchNormalizationLayer>(layerDesc, "layer"); + + armnn::TensorInfo weightInfo({3}, armnn::DataType::Float32); + layer->m_Mean = std::make_unique<ScopedCpuTensorHandle>(weightInfo); + layer->m_Variance = std::make_unique<ScopedCpuTensorHandle>(weightInfo); + layer->m_Beta = std::make_unique<ScopedCpuTensorHandle>(weightInfo); + layer->m_Gamma = std::make_unique<ScopedCpuTensorHandle>(weightInfo); + layer->m_Mean->Allocate(); + layer->m_Variance->Allocate(); + layer->m_Beta->Allocate(); + layer->m_Gamma->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // connect up + armnn::TensorInfo tensorInfo({2, 3, 1, 1}, armnn::DataType::Float32); + Connect(input, layer, tensorInfo); + Connect(layer, output, tensorInfo); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Mean != nullptr); + BOOST_CHECK(layer->m_Variance != nullptr); + BOOST_CHECK(layer->m_Beta != nullptr); + BOOST_CHECK(layer->m_Gamma != nullptr); + + // free up the constants.. + layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Mean == nullptr); + BOOST_CHECK(layer->m_Variance == nullptr); + BOOST_CHECK(layer->m_Beta == nullptr); + BOOST_CHECK(layer->m_Gamma == nullptr); + + } + + + BOOST_AUTO_TEST_CASE(ReleaseConvolution2dLayerConstantDataTest) + { + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + Convolution2dDescriptor layerDesc; + layerDesc.m_PadLeft = 3; + layerDesc.m_PadRight = 3; + layerDesc.m_PadTop = 1; + layerDesc.m_PadBottom = 1; + layerDesc.m_StrideX = 2; + layerDesc.m_StrideY = 4; + layerDesc.m_BiasEnabled = true; + + Convolution2dLayer* const layer = graph.AddLayer<Convolution2dLayer>(layerDesc, "layer"); + + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2, 3, 5, 3}, + armnn::DataType::Float32)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({2}, GetBiasDataType(armnn::DataType::Float32))); + + layer->m_Weight->Allocate(); + layer->m_Bias->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // connect up + Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); + Connect(layer, output, TensorInfo({2, 2, 2, 10}, armnn::DataType::Float32)); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Weight != nullptr); + BOOST_CHECK(layer->m_Bias != nullptr); + + // free up the constants.. 
+ layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Weight == nullptr); + BOOST_CHECK(layer->m_Bias == nullptr); +} + +BOOST_AUTO_TEST_CASE(ReleaseDepthwiseConvolution2dLayerConstantDataTest) +{ + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + DepthwiseConvolution2dDescriptor layerDesc; + layerDesc.m_PadLeft = 3; + layerDesc.m_PadRight = 3; + layerDesc.m_PadTop = 1; + layerDesc.m_PadBottom = 1; + layerDesc.m_StrideX = 2; + layerDesc.m_StrideY = 4; + layerDesc.m_BiasEnabled = true; + + DepthwiseConvolution2dLayer* const layer = graph.AddLayer<DepthwiseConvolution2dLayer>(layerDesc, "layer"); + + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({3, 3, 5, 3}, DataType::Float32)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({9}, DataType::Float32)); + layer->m_Weight->Allocate(); + layer->m_Bias->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // connect up + Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); + Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32)); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Weight != nullptr); + BOOST_CHECK(layer->m_Bias != nullptr); + + // free up the constants.. + layer->ReleaseConstantData(); + + // check the constants that they are NULL now + BOOST_CHECK(layer->m_Weight == nullptr); + BOOST_CHECK(layer->m_Bias == nullptr); +} + +BOOST_AUTO_TEST_CASE(ReleaseFullyConnectedLayerConstantDataTest) +{ + Graph graph; + ClWorkloadFactory factory; + + // create the layer we're testing + FullyConnectedDescriptor layerDesc; + layerDesc.m_BiasEnabled = true; + layerDesc.m_TransposeWeightMatrix = true; + + FullyConnectedLayer* const layer = graph.AddLayer<FullyConnectedLayer>(layerDesc, "layer"); + + float inputsQScale = 1.0f; + float outputQScale = 2.0f; + + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7, 20}, + DataType::QuantisedAsymm8, inputsQScale, 0)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7}, + GetBiasDataType(DataType::QuantisedAsymm8), inputsQScale)); + layer->m_Weight->Allocate(); + layer->m_Bias->Allocate(); + + // create extra layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // connect up + Connect(input, layer, TensorInfo({3, 1, 4, 5}, DataType::QuantisedAsymm8, inputsQScale)); + Connect(layer, output, TensorInfo({3, 7}, DataType::QuantisedAsymm8, outputQScale)); + + // check the constants that they are not NULL + BOOST_CHECK(layer->m_Weight != nullptr); + BOOST_CHECK(layer->m_Bias != nullptr); + + // free up the constants.. 
+ layer->ReleaseConstantData(); + + // Check that the constants are now NULL. + BOOST_CHECK(layer->m_Weight == nullptr); + BOOST_CHECK(layer->m_Bias == nullptr); +} + +BOOST_AUTO_TEST_SUITE_END() + diff --git a/src/armnn/backends/test/LayerTests.cpp b/src/armnn/backends/test/LayerTests.cpp index a10e4bd7a0..8039ffb9b1 100644 --- a/src/armnn/backends/test/LayerTests.cpp +++ b/src/armnn/backends/test/LayerTests.cpp @@ -35,8 +35,11 @@ #include "SoftmaxTestImpl.hpp" #include "NormTestImpl.hpp" #include "PermuteTestImpl.hpp" +#include "LstmTestImpl.hpp" +#include "ConvertFp16ToFp32TestImpl.hpp" +#include "ConvertFp32ToFp16TestImpl.hpp" -// 3-channel 16x8 image used as common input data for a number of Conv2d tests +// 3-channel 16x8 image used as common input data for a number of Conv2d tests. static std::vector<float> ConvInput3x8x16({ 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.5f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, 0.0f, @@ -64,10 +67,10 @@ static std::vector<float> ConvInput3x8x16({ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1 }); -// 2-channel bias used by a number of Conv2d tests +// 2-channel bias used by a number of Conv2d tests. static std::vector<float> Bias2({0, 2}); -// Helper function that returns either Bias2 or an empty vector depending on whether bias is enabled +// Helper function that returns either Bias2 or an empty vector depending on whether bias is enabled. template<typename T> boost::multi_array<T, 1> GetBias2(bool biasEnabled, float qScale, int32_t qOffset) { @@ -89,11 +92,11 @@ LayerTestResult<T, 4> SimpleConvolution2d3x5TestCommon(armnn::IWorkloadFactory& int32_t qOffset, bool biasEnabled) { - // Use common single-batch 3-channel 16x8 image + // Use the common single-batch 3-channel 16x8 image. armnn::TensorInfo inputDesc({1, 3, 8, 16}, armnn::GetDataType<T>()); boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, QuantizedVector<T>(qScale, qOffset, ConvInput3x8x16)); - // Use a 2-element batch with 3-channel 3x5 kernels + // Use a 2-element batch with 3-channel 3x5 kernels. armnn::TensorInfo kernelDesc({2, 3, 5, 3}, armnn::GetDataType<T>()); boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -135,7 +138,7 @@ LayerTestResult<T, 4> SimpleConvolution2d3x5TestCommon(armnn::IWorkloadFactory& 0, 0, 0 }))); - // Expected output is 2 batch elements of a 1-channel 14x4 image + // Expected output is 2 batch elements of a 1-channel 14x4 image. armnn::TensorInfo outputDesc({1, 2, 4, 14}, armnn::GetDataType<T>()); boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -167,13 +170,13 @@ LayerTestResult<T, 4> SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory& int32_t qOffset, bool biasEnabled) { - // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path + // Use a 3x3 kernel, which exercises ArmCompute's direct convolution path. - // Use common single-batch 3-channel 16x8 image + // Use the common single-batch 3-channel 16x8 image. armnn::TensorInfo inputDesc({1, 3, 8, 16}, armnn::GetDataType<T>()); boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, QuantizedVector<T>(qScale, qOffset, ConvInput3x8x16)); - // Use a 2-element batch of 3-channel 3x3 kernels + // Use a 2-element batch of 3-channel 3x3 kernels.
armnn::TensorInfo kernelDesc({2, 3, 3, 3}, armnn::GetDataType<T>()); boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -203,7 +206,7 @@ LayerTestResult<T, 4> SimpleConvolution2d3x3TestCommon(armnn::IWorkloadFactory& 0, 0, 0 }))); - // Expected output is 1 batch of a 2-channel 14x6 image + // Expected output is 1 batch of a 2-channel 14x6 image. armnn::TensorInfo outputDesc({1, 2, 6, 14}, armnn::GetDataType<T>()); boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -261,7 +264,7 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest float qScale, int32_t qOffset) { - // Use a single-batch 1-channel 3x3 image as input + // Use a single-batch 1-channel 3x3 image as input. armnn::TensorInfo inputDesc({1, 1, 3, 3}, armnn::GetDataType<T>()); boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -270,7 +273,7 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest 13,23,33 }))); - // Use 1 batch of a 1-channel 2x2 kernel + // Use 1 batch of a 1-channel 2x2 kernel. armnn::TensorInfo kernelDesc({1, 1, 2, 2}, armnn::GetDataType<T>()); boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -278,7 +281,7 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest -12,-22, }))); -// Expected output is 1 batch of a 1-channel 6x8 image +// Expected output is 1 batch of a 1-channel 6x8 image. // Manually calculated like this: //[-11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ; -11*0 -21*0 -12*0 -22*0 ..] //[-11*0 -21*0 -12*0 -22*11 ; -11*0 -21*0 -12*11 -22*21 ; -11*0 -21*0 -12*21 -22*31 ; -11*0 -21*0 -12*31 -22*0 ..] @@ -307,10 +310,10 @@ LayerTestResult<T, 4> Convolution2dAsymmetricPaddingLargerThanHalfKernelSizeTest expectedOutput, qScale, qOffset, - 1, // padding left - 2, // padding top - 3, // padding right - 4); // padding bottom + 1, // Padding left. + 2, // Padding top. + 3, // Padding right. + 4); // Padding bottom. } template<typename T> @@ -318,7 +321,7 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor float qScale, int32_t qOffset) { - // Use a single-batch 1-channel 5x5 image as input + // Use a single-batch 1-channel 5x5 image as input. armnn::TensorInfo inputDesc({ 1, 1, 5, 5 }, armnn::GetDataType<T>()); boost::multi_array<T, 4> input = MakeTensor<T, 4>(inputDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -329,7 +332,7 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor 15,25,35,45,55, }))); - // Use 1 batch of a 1-channel 4x4 kernel + // Use 1 batch of a 1-channel 4x4 kernel. armnn::TensorInfo kernelDesc({ 1, 1, 4, 4 }, armnn::GetDataType<T>()); boost::multi_array<T, 4> kernel = MakeTensor<T, 4>(kernelDesc, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { @@ -339,7 +342,7 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor -14,-24,-34,-44, }))); - // Expected output is 1 batch of a 1-channel 5x5 image + // Expected output is 1 batch of a 1-channel 5x5 image. 
armnn::TensorInfo outputDesc({ 1, 1, 5, 5 }, armnn::GetDataType<T>()); std::vector<T> myVec(outputDesc.GetNumElements(), 0); boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputDesc, std::vector<T>( @@ -358,10 +361,10 @@ LayerTestResult<T, 4> SimpleConvolution2dAsymmetricPaddingTestCommon(armnn::IWor expectedOutput, qScale, qOffset, - 1, // padding left - 1, // padding top - 2, // padding right - 2); // padding bottom + 1, // Padding left. + 1, // Padding top. + 2, // Padding right. + 2); // Padding bottom. } template<typename T> @@ -370,7 +373,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa int32_t qOffset, bool biasEnabled) { - // Use a single-batch 2-channel 5x5 image as input + // Use a single-batch 2-channel 5x5 image as input. armnn::TensorInfo inputTensorInfo({ 1, 2, 5, 5 }, armnn::GetDataType<T>()); auto input = MakeTensor<T, 4>(inputTensorInfo, std::vector<T>( QuantizedVector<T>(inputTensorInfo.GetQuantizationScale(), inputTensorInfo.GetQuantizationOffset(), { @@ -387,7 +390,7 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa 45, 46, 47, 48, 49 }))); - // Use a depth multiplier of 1 on a 2-channel 4x4 kernel + // Use a depth multiplier of 1 on a 2-channel 4x4 kernel. armnn::TensorInfo kernelTensorInfo({ 1, 2, 4, 4 }, armnn::GetDataType<T>()); auto kernel = MakeTensor<T, 4>(kernelTensorInfo, std::vector<T>( QuantizedVector<T>(kernelTensorInfo.GetQuantizationScale(), kernelTensorInfo.GetQuantizationOffset(), { @@ -402,8 +405,8 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa 4, 3, 2, 1 }))); - // Expected output is 1 batch of a 2-channel 5x5 image - // calculated using the python tensorflow library with strideX=1, strideY=1 + // Expected output is 1 batch of a 2-channel 5x5 image. + // Calculated using the python tensorflow library with strideX=1, strideY=1. armnn::TensorInfo outputTensorInfo({ 1, 2, 5, 5 }, armnn::GetDataType<T>()); boost::multi_array<T, 4> expectedOutput = MakeTensor<T, 4>(outputTensorInfo, std::vector<T>( QuantizedVector<T>(outputTensorInfo.GetQuantizationScale(), outputTensorInfo.GetQuantizationOffset(), { @@ -426,10 +429,10 @@ LayerTestResult<T, 4> DepthwiseConvolution2dAsymmetricTestCommon(armnn::IWorkloa expectedOutput, qScale, qOffset, - 1, // padding left - 1, // padding top - 2, // padding right - 2, // padding bottom + 1, // Padding left. + 1, // Padding top. + 2, // Padding right. + 2, // Padding bottom. 1, // strideX 1); // strideY } @@ -569,6 +572,55 @@ LayerTestResult<uint8_t, 3> CopyViaSplitterUint8Test(armnn::IWorkloadFactory& wo return CopyViaSplitterTestImpl<uint8_t>(workloadFactory, 1.0f, 0); } +LayerTestResult<float, 2> LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest( + armnn::IWorkloadFactory& workloadFactory) +{ + armnn::TensorInfo inputDesc({ 2, 2 }, armnn::GetDataType<float>()); + boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>( + { 2., 3., 3., 4. 
})); + + armnn::TensorInfo outputDesc({ 2, 4 }, armnn::GetDataType<float>()); + boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>( + {-0.36444446f, -0.00352185f, 0.12886585f, -0.05163646f, + -0.42734814f, -0.00478661f, 0.13455015f, -0.03560682f})); + return LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(workloadFactory, input, expectedOutput); +} + +LayerTestResult<float, 2> LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest( + armnn::IWorkloadFactory& workloadFactory) +{ + armnn::TensorInfo inputDesc({ 2, 5 }, armnn::GetDataType<float>()); + boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>( + {0.787926f, 0.151646f, 0.071352f, 0.118426f, 0.458058f, + 0.295743f, 0.544053f, 0.690064f, 0.858138f, 0.497181f})); + + armnn::TensorInfo outputDesc({ 2, 16 }, armnn::GetDataType<float>()); + boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>( + {-0.00396806f, 0.029352f, -0.00279226f, 0.0159977f, -0.00835576f, + -0.0211779f, 0.0283512f, -0.0114597f, 0.00907307f, -0.0244004f, + -0.0152191f, -0.0259063f, 0.00914318f, 0.00415118f, 0.017147f, + 0.0134203f, -0.013869f, 0.0287268f, -0.00334693f, 0.00733398f, -0.0287926f, + -0.0186926f, 0.0193662f, -0.0115437f, 0.00422612f, -0.0345232f, + 0.00223253f, -0.00957321f, 0.0210624f, 0.013331f, 0.0150954f, + 0.02168f})); + return LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(workloadFactory, input, expectedOutput); +} + +LayerTestResult<float, 2> LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(armnn::IWorkloadFactory& workloadFactory) +{ + armnn::TensorInfo inputDesc({2, 2}, armnn::GetDataType<float>()); + boost::multi_array<float, 2> input = MakeTensor<float, 2>(inputDesc, std::vector<float>( + {2., 3., 3., 4.})); + + + armnn::TensorInfo outputDesc({2, 4}, armnn::GetDataType<float>()); + boost::multi_array<float, 2> expectedOutput = MakeTensor<float, 2>(outputDesc, std::vector<float>( + {{-0.02973187f, 0.1229473f, 0.20885126f, -0.15358765f, + -0.0185422f, 0.11281417f, 0.24466537f, -0.1826292f}})); + + return LstmNoCifgNoPeepholeNoProjectionTestImpl(workloadFactory, input, expectedOutput); +} + LayerTestResult<float,3> MergerTest(armnn::IWorkloadFactory& workloadFactory) { unsigned int outputWidth = 3; @@ -583,7 +635,7 @@ LayerTestResult<float,3> MergerTest(armnn::IWorkloadFactory& workloadFactory) unsigned int inputHeight2 = 6; unsigned int inputChannels2 = 1; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::Float32); armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::Float32); armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::Float32); @@ -644,10 +696,10 @@ LayerTestResult<float,3> MergerTest(armnn::IWorkloadFactory& workloadFactory) }) ); - std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //extent of the window is defined by size of input[0] + std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of input[0]. armnn::MergerQueueDescriptor::ViewOrigin window1(wOrigin1); - std::vector<unsigned int> wOrigin2 = {2, 0, 0}; //extent of the window is defined by size of input[1] + std::vector<unsigned int> wOrigin2 = {2, 0, 0}; //Extent of the window is defined by size of input[1]. 
armnn::MergerQueueDescriptor::ViewOrigin window2(wOrigin2); std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); @@ -1350,7 +1402,7 @@ armnn::OriginsDescriptor CreateMergerDescriptorForConcatenation( // // Concatenation is only supported for N and C dimensions for NCHW. In case of -// <4 dimensions we need to make sure that the concat dimensions is at least +// <4 dimensions we need to make sure that the concat dimensions are at least // the 3rd slowest iterating one. // @@ -1362,8 +1414,8 @@ bool NeedPermuteForConcat( // same number of dimensions. unsigned int nDimensions = 0; - // determine the number of dimensions as well as sanity check them - // agains test implementation issues + // Determine the number of dimensions as well as sanity check them + // against test implementation issues. for (auto && tensorInfo : inputTensorInfos) { if (!nDimensions) @@ -1464,7 +1516,7 @@ void PermuteInputsForConcat( { numDims = tensorInfo.GetShape().GetNumDimensions(); Generate3dPermuteVectorForConcat(numDims, concatDim, permutations); - // store the reverese permutation + // Store the reverse permutation. permuteVector = permutations.second; BOOST_ASSERT_MSG(!permuteVector.IsEqual(identity), "Test logic error, we don't need permutation, so we shouldn't arrive here"); @@ -1499,7 +1551,7 @@ void PermuteInputsForConcat( // // This is the pair of PermuteInputsForConcat(...) which permutes back -// the output of the concatenation so we can check against an expected +// the output of the concatenation so we can check it against an expected // output. // template <typename T> @@ -1553,14 +1605,14 @@ void Concatenate(armnn::IWorkloadFactory& workloadFactory, armnn::MergerQueueDescriptor queueDescriptor; - // save a copy of the parameters which we might need to change + // Saves a copy of the parameters which we might need to change. std::vector<armnn::TensorInfo> inputTensorInfos(inputTensorInfosOrig.begin(), inputTensorInfosOrig.end()); std::vector<T *> inputs = inputsOrig; armnn::TensorInfo outputTensorInfo = outputTensorInfoOrig; armnn::PermutationVector permuteVector{0, 1, 2}; - // hold and automatically release memory for the reshaped input data + // Holds and automatically releases memory for the reshaped input data. std::vector<std::vector<T>> tmpInputDataStorage; const size_t inputCount = inputTensorInfos.size(); @@ -1571,7 +1623,7 @@ void Concatenate(armnn::IWorkloadFactory& workloadFactory, { // // We need to permute the inputs, because concatenation along - // the requested axis is not supported + // the requested axis is not supported. // PermuteInputsForConcat<T>(workloadFactory, inputTensorInfos, @@ -2641,7 +2693,7 @@ LayerTestResult<float, 4> SimpleResizeBilinearTest(armnn::IWorkloadFactory& work // The 'resize bilinear' operation projects the top-left corner of output texels into the input image, // then figures out the interpolants and weights. Note this is different to projecting the centre of the - // output texel - and thus we'll expect the output 1x1 matrix to contain as its single element the value + // output texel - and thus we'll expect the output 1x1 matrix to contain, as its single element, the value // that was at position (0,0) of the input matrix (rather than an average, which we would expect if projecting // the centre).
LayerTestResult<float, 4> result(outputTensorInfo); @@ -3367,12 +3419,12 @@ LayerTestResult<uint8_t, 3> MergerUint8Test(armnn::IWorkloadFactory& workloadFac unsigned int inputHeight2 = 6; unsigned int inputChannels2 = 1; - // Define the tensor descriptors + // Defines the tensor descriptors. armnn::TensorInfo outputTensorInfo({ outputChannels, outputHeight, outputWidth }, armnn::DataType::QuantisedAsymm8); armnn::TensorInfo inputTensorInfo1({ inputChannels1, inputHeight1, inputWidth1 }, armnn::DataType::QuantisedAsymm8); armnn::TensorInfo inputTensorInfo2({ inputChannels2, inputHeight2, inputWidth2 }, armnn::DataType::QuantisedAsymm8); - // Arbitrary scale and offsets. They don't really matter as the merger operator doesn't dequantize/quantize + // Arbitrary scale and offsets. They don't really matter as the merger operator doesn't dequantize/quantize them. const float scale = 0.13497836f; const int32_t offset = -7; @@ -3439,10 +3491,10 @@ LayerTestResult<uint8_t, 3> MergerUint8Test(armnn::IWorkloadFactory& workloadFac }) ); - std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; //extent of the window is defined by size of input[0] + std::vector<unsigned int> wOrigin1 = { 0, 0, 0 }; //Extent of the window is defined by size of input[0]. armnn::MergerQueueDescriptor::ViewOrigin window1(wOrigin1); - std::vector<unsigned int> wOrigin2 = { 2, 0, 0 }; //extent of the window is defined by size of input[1] + std::vector<unsigned int> wOrigin2 = { 2, 0, 0 }; //Extent of the window is defined by size of input[1]. armnn::MergerQueueDescriptor::ViewOrigin window2(wOrigin2); @@ -3513,21 +3565,21 @@ LayerTestResult<uint8_t, 4> AdditionUint8Test(armnn::IWorkloadFactory& workloadF outputTensorInfo.SetQuantizationScale(scale); outputTensorInfo.SetQuantizationOffset(offset); - // See dequantized values to the right + // See dequantized values to the right. auto input1 = MakeTensor<uint8_t, 4>(inputTensorInfo1, std::vector<uint8_t>( { 63, 35, 77, 70, 56, 112, // 420, 224, 518, 469, 371, 763 203, 28, 252, 168, 245, 91 // 1400, 175, 1743, 1155, 1694, 616 })); - // See dequantized values to the right + // See dequantized values to the right. auto input2 = MakeTensor<uint8_t, 4>(inputTensorInfo1, std::vector<uint8_t>( { 21, 7, 175, 231, 175, 210, // 126, 28, 1204, 1596, 1204, 1449 126, 161, 63, 21, 105, 126 // 861, 1106, 420, 126, 714, 861 })); - // See dequantized values to the right + // See dequantized values to the right. LayerTestResult<uint8_t, 4> result(outputTensorInfo); result.outputExpected = MakeTensor<uint8_t, 4>(outputTensorInfo, std::vector<uint8_t>( { @@ -3633,19 +3685,19 @@ LayerTestResult<uint8_t, 4> MultiplicationUint8Test(armnn::IWorkloadFactory& wor unsigned int width = 3; const unsigned int shape[] = { batchSize, channels, height, width }; - // See dequantized values to the right + // See dequantized values to the right. std::vector<uint8_t> input0({ 62, 37, 3, 172, 13, 111, // 244, 144, 8, 684, 48, 440, 188, 20, 73, 31, 23, 31 // 748, 76, 288, 120, 88, 120 }); - // See dequantized values to the right + // See dequantized values to the right. std::vector<uint8_t> input1({ 126, 240, 252, 183, 121, 247, // 384, 726, 762, 555, 369, 747, 48, 115, 151, 79, 78, 97 // 150, 351, 459, 243, 240, 297 }); - // See dequantized values to the right + // See dequantized values to the right. 
std::vector<uint8_t> output( { 64, 72, 0, 255, 8, 236, // 93696, 104544, 6096(clamped), 379620(clamped), 17712, 328680, @@ -3663,7 +3715,7 @@ LayerTestResult<uint8_t, 4> MultiplicationUint8Test(armnn::IWorkloadFactory& wor -2, shape, output, - 1366.255f, // Scale/offset chosen to have output values out of range + 1366.255f, // Scale/offset chosen to have output values out of range. -5); } @@ -3813,7 +3865,7 @@ LayerTestResult<uint8_t, 4> SimpleResizeBilinearUint8Test(armnn::IWorkloadFactor // The 'resize bilinear' operation projects the top-left corner of output texels into the input image, // then figures out the interpolants and weights. Note this is different to projecting the centre of the - // output texel - and thus we'll expect the output 1x1 matrix to contain as its single element the value + // output texel - and thus we'll expect the output 1x1 matrix to contain, as its single element, the value // that was at position (0,0) of the input matrix (rather than an average, which we would expect if projecting // the centre). LayerTestResult<uint8_t, 4> result(outputTensorInfo); @@ -4314,4 +4366,4 @@ LayerTestResult<float, 4> PermuteFloat32ValueSet2Test(armnn::IWorkloadFactory& w LayerTestResult<float, 4> PermuteFloat32ValueSet3Test(armnn::IWorkloadFactory& workloadFactory) { return PermuteFloat32ValueSet3TestCommon(workloadFactory); -}; +};
\ No newline at end of file diff --git a/src/armnn/backends/test/LayerTests.hpp b/src/armnn/backends/test/LayerTests.hpp index 2d543d61de..48f73e7693 100644 --- a/src/armnn/backends/test/LayerTests.hpp +++ b/src/armnn/backends/test/LayerTests.hpp @@ -6,12 +6,13 @@ #include "armnn/ArmNN.hpp" #include "armnn/Tensor.hpp" +#include "Half.hpp" #include <boost/multi_array.hpp> #include <boost/assert.hpp> #include <array> -// Layer callables +// Layer callables. namespace armnn { @@ -213,20 +214,20 @@ LayerTestResult<float, 4> CompareBoundedReLuTest(armnn::IWorkloadFactory& worklo float upperBound, float lowerBound); -// Tests that the output should be identical to the input when the output dimensions match the input ones +// Tests that the output should be identical to the input when the output dimensions match the input ones. LayerTestResult<float, 4> ResizeBilinearNopTest(armnn::IWorkloadFactory& workloadFactory); -// Tests the behaviour of the resize bilinear operation when rescaling a 2x2 image into a 1x1 image +// Tests the behaviour of the resize bilinear operation when rescaling a 2x2 image into a 1x1 image. LayerTestResult<float, 4> SimpleResizeBilinearTest(armnn::IWorkloadFactory& workloadFactory); -// Tests resize bilinear for minification of a square input matrix (also: input dimensions are a -// multiple of output dimensions) +// Tests the resize bilinear for minification of a square input matrix (also: input dimensions are a +// multiple of output dimensions). LayerTestResult<float, 4> ResizeBilinearSqMinTest(armnn::IWorkloadFactory& workloadFactory); -// Tests resize bilinear for minification (output dimensions smaller than input dimensions) +// Tests the resize bilinear for minification (output dimensions smaller than input dimensions). LayerTestResult<float, 4> ResizeBilinearMinTest(armnn::IWorkloadFactory& workloadFactory); -// Tests resize bilinear for magnification (output dimensions bigger than input dimensions) +// Tests the resize bilinear for magnification (output dimensions bigger than input dimensions). LayerTestResult<float, 4> ResizeBilinearMagTest(armnn::IWorkloadFactory& workloadFactory); LayerTestResult<float, 4> BatchNormTest(armnn::IWorkloadFactory& workloadFactory); @@ -315,3 +316,13 @@ LayerTestResult<uint8_t, 4> SimplePermuteUint8Test(armnn::IWorkloadFactory& work LayerTestResult<float, 4> PermuteFloat32ValueSet1Test(armnn::IWorkloadFactory& workloadFactory); LayerTestResult<float, 4> PermuteFloat32ValueSet2Test(armnn::IWorkloadFactory& workloadFactory); LayerTestResult<float, 4> PermuteFloat32ValueSet3Test(armnn::IWorkloadFactory& workloadFactory); + +LayerTestResult<float, 2> LstmLayerFloat32WithCifgWithPeepholeNoProjectionTest + (armnn::IWorkloadFactory& workloadFactory); +LayerTestResult<float, 2> + LstmLayerFloat32NoCifgNoPeepholeNoProjectionTest(armnn::IWorkloadFactory& workloadFactory); +LayerTestResult<float, 2> +LstmLayerFloat32NoCifgWithPeepholeWithProjectionTest(armnn::IWorkloadFactory& workloadFactory); + +LayerTestResult<float, 4> SimpleConvertFp16ToFp32Test(armnn::IWorkloadFactory& workloadFactory); +LayerTestResult<armnn::Half, 4> SimpleConvertFp32ToFp16Test(armnn::IWorkloadFactory& workloadFactory); diff --git a/src/armnn/backends/test/LstmTestImpl.hpp b/src/armnn/backends/test/LstmTestImpl.hpp new file mode 100644 index 0000000000..7f67b020e2 --- /dev/null +++ b/src/armnn/backends/test/LstmTestImpl.hpp @@ -0,0 +1,1150 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#pragma once + +#include <armnn/ArmNN.hpp> +#include <armnn/Tensor.hpp> +#include <armnn/TypesUtils.hpp> + +#include "test/TensorHelpers.hpp" +#include "QuantizeHelper.hpp" + +#include "backends/CpuTensorHandle.hpp" +#include <backends/WorkloadInfo.hpp> +#include "backends/WorkloadFactory.hpp" + +LayerTestResult<float, 2> LstmNoCifgNoPeepholeNoProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory, + const boost::multi_array<float, 2>& input, + const boost::multi_array<float, 2>& outputExpected) +{ + unsigned int batchSize = boost::numeric_cast<unsigned int>(input.shape()[0]); + unsigned int inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]); + unsigned int outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]); + // cellSize and outputSize have the same size when there is no projection. + unsigned numUnits = outputSize; + + + armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>()); + armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>()); + + + armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>()); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + + + LayerTestResult<float, 2> ret(outputTensorInfo); + + std::vector<float> inputVector; + inputVector.assign(input.data(), input.data() + (batchSize * inputSize)); + auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputVector); + + std::vector<float> cellStateInVector(batchSize * numUnits, 0.f); + auto cellStateInTensor = MakeTensor<float,2>(cellStateInTensorInfo, cellStateInVector); + + std::vector<float> outputStateInVector(batchSize * outputSize, 0.f); + auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector); + + std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f); + auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector); + + std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f); + auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector); + + std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f); + auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector); + + std::vector<float> outputVector; + outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize)); + ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector); + + std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<armnn::ITensorHandle> cellStateInHandle = + workloadFactory.CreateTensorHandle(cellStateInTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputStateInHandle = + workloadFactory.CreateTensorHandle(outputStateInTensorInfo); + + std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle = + workloadFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle = + 
workloadFactory.CreateTensorHandle(cellStateOutTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + + armnn::LstmQueueDescriptor data; + armnn::WorkloadInfo info; + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); + AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + armnn::TensorInfo tensorInfo4({numUnits}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfo8({numUnits, 2}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfo16({numUnits, 4}, armnn::GetDataType<float>()); + + auto inputToInputWeights = MakeTensor<float, 2>(tensorInfo8, {-0.45018822f, -0.02338299f, -0.0870589f, + -0.34550029f, 0.04266912f, -0.15680569f, + -0.34856534f, 0.43890524f}); + + auto inputToForgetWeights = MakeTensor<float, 2>(tensorInfo8, {0.09701663f, 0.20334584f, -0.50592935f, + -0.31343272f, -0.40032279f, 0.44781327f, + 0.01387155f, -0.35593212f}); + + auto inputToCellWeights = MakeTensor<float, 2>(tensorInfo8, {-0.50013041f, 0.1370284f, 0.11810488f, 0.2013163f, + -0.20583314f, 0.44344562f, 0.22077113f, + -0.29909778f}); + + auto inputToOutputWeights = MakeTensor<float, 2>(tensorInfo8, {-0.25065863f, -0.28290087f, 0.04613829f, + 0.40525138f, 0.44272184f, 0.03897077f, + -0.1556896f, 0.19487578f}); + + auto recurrentToInputWeights = MakeTensor<float, 2>(tensorInfo16, {-0.0063535f, -0.2042388f, 0.31454784f, + -0.35746509f, 0.28902304f, 0.08183324f, + -0.16555229f, 0.02286911f, -0.13566875f, + 0.03034258f, 0.48091322f, -0.12528998f, + 0.24077177f, -0.51332325f, -0.33502164f, + 0.10629296f}); + + auto recurrentToForgetWeights = MakeTensor<float, 2>(tensorInfo16, {-0.48684245f, -0.06655136f, 0.42224967f, + 0.2112639f, 0.27654213f, 0.20864892f, + -0.07646349f, 0.45877004f, 0.00141793f, + -0.14609534f, 0.36447752f, 0.09196436f, + 0.28053468f, 0.01560611f, -0.20127171f, + -0.01140004f}); + + auto recurrentToCellWeights = MakeTensor<float, 2>(tensorInfo16, {-0.3407414f, 0.24443203f, -0.2078532f, + 0.26320225f, 0.05695659f, -0.00123841f, + -0.4744786f, -0.35869038f, -0.06418842f, + -0.13502428f, -0.501764f, 0.22830659f, + -0.46367589f, 0.26016325f, -0.03894562f, + -0.16368064f}); + + auto recurrentToOutputWeights = MakeTensor<float, 2>(tensorInfo16, {0.43385774f, -0.17194885f, 0.2718237f, + 0.09215671f, 0.24107647f, -0.39835793f, + 0.18212086f, 0.01301402f, 0.48572797f, + -0.50656658f, 0.20047462f, -0.20607421f, + -0.51818722f, -0.15390486f, 0.0468148f, + 0.39922136f}); + + auto cellToInputWeights = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.}); + + auto inputGateBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.}); + + auto forgetGateBias = MakeTensor<float, 1>(tensorInfo4, {1., 1., 1., 1.}); + + auto cellBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.}); + + auto outputGateBias = MakeTensor<float, 1>(tensorInfo4, {0., 0., 0., 0.}); + + armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo8); + 
armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo8); + armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo16); + armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo4); + armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo4); + + AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]); + AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]); + AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]); + AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]); + AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]); + + data.m_InputToInputWeights = &inputToInputWeightsTensor; + data.m_InputToForgetWeights = &inputToForgetWeightsTensor; + data.m_InputToCellWeights = &inputToCellWeightsTensor; + data.m_InputToOutputWeights = &inputToOutputWeightsTensor; + data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor; + data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor; + data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor; + data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor; + data.m_CellToInputWeights = &cellToInputWeightsTensor; + data.m_InputGateBias = &inputGateBiasTensor; + data.m_ForgetGateBias = &forgetGateBiasTensor; + data.m_CellBias = &cellBiasTensor; + data.m_OutputGateBias = &outputGateBiasTensor; + + + // Flags to set test configuration + data.m_Parameters.m_ActivationFunc = 4; + data.m_Parameters.m_CifgEnabled = false; + data.m_Parameters.m_PeepholeEnabled = false; + data.m_Parameters.m_ProjectionEnabled = false; + + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info); + inputHandle->Allocate(); + outputStateInHandle->Allocate(); + cellStateInHandle->Allocate(); + + scratchHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]); + CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]); + CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]); + + workloadFactory.Finalize(); + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get()); + + return ret; +} + + 
+LayerTestResult<float, 2> +LstmLayerFloat32NoCifgWithPeepholeWithProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory, + const boost::multi_array<float, 2>& input, + const boost::multi_array<float, 2>& outputExpected) { + + unsigned int batchSize = 2; + unsigned int outputSize = 16; + unsigned int inputSize = 5; + unsigned numUnits = 20; + + armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo cellStateInTensorInfo({batchSize , numUnits}, armnn::GetDataType<float>()); + armnn::TensorInfo outputStateInTensorInfo({batchSize , outputSize}, armnn::GetDataType<float>()); + + // Scratch buffer size without CIFG [batchSize, numUnits * 3] + armnn::TensorInfo scratchBufferTensorInfo({batchSize, numUnits * 3}, armnn::GetDataType<float>()); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, numUnits}, armnn::GetDataType<float>()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + + LayerTestResult<float, 2> ret(outputTensorInfo); + + std::vector<float> inputVector; + inputVector.assign(input.data(), input.data() + (batchSize * inputSize)); + auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputVector); + + std::vector<float> cellStateInVector(batchSize * numUnits, 0.f); + auto cellStateInTensor = MakeTensor<float,2>(cellStateInTensorInfo, cellStateInVector); + + std::vector<float> outputStateInVector(batchSize * outputSize, 0.f); + auto outputStateInTensor = MakeTensor<float,2>(outputStateInTensorInfo, outputStateInVector); + + std::vector<float> scratchBufferVector(batchSize * numUnits * 3, 0.f); + auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector); + + std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f); + auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector); + + std::vector<float> cellStateOutVector(batchSize * numUnits, 0.f); + auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector); + + std::vector<float> outputVector; + outputVector.assign(outputExpected.data(), outputExpected.data() + (batchSize * outputSize)); + ret.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputVector); + + std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<armnn::ITensorHandle> cellStateInHandle = + workloadFactory.CreateTensorHandle(cellStateInTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputStateInHandle = + workloadFactory.CreateTensorHandle(outputStateInTensorInfo); + + std::unique_ptr<armnn::ITensorHandle> scratchHandle = workloadFactory.CreateTensorHandle(scratchBufferTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle = + workloadFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle = + workloadFactory.CreateTensorHandle(cellStateOutTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::LstmQueueDescriptor data; + armnn::WorkloadInfo info; + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); + AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, 
scratchBufferTensorInfo, scratchHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + armnn::TensorInfo tensorInfo16({outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfo20({numUnits}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfo20x5({numUnits, inputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfo20x16({numUnits, outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfo16x20({outputSize, numUnits}, armnn::GetDataType<float>()); + + auto inputToInputWeights = + MakeTensor<float, 2>(tensorInfo20x5, {0.021393683f,0.06124551f, 0.046905167f,-0.014657677f,-0.03149463f, + 0.09171803f, 0.14647801f,0.10797193f, -0.0057968358f,0.0019193048f, + -0.2726754f, 0.10154029f, -0.018539885f, 0.080349885f, -0.10262385f, + -0.022599787f,-0.09121155f, -0.008675967f, -0.045206103f,-0.0821282f, + -0.008045952f,0.015478081f, 0.055217247f, 0.038719587f, 0.044153627f, + -0.06453243f,0.05031825f, -0.046935108f, -0.008164439f, 0.014574226f, + -0.1671009f, -0.15519552f, -0.16819797f,-0.13971269f,-0.11953059f, + 0.25005487f, -0.22790983f, 0.009855087f, -0.028140958f, -0.11200698f, + 0.11295408f, -0.0035217577f, 0.054485075f, 0.05184695f, 0.064711206f, + 0.10989193f, 0.11674786f, 0.03490607f, 0.07727357f, 0.11390585f, + -0.1863375f, -0.1034451f, -0.13945189f, -0.049401227f, -0.18767063f, + 0.042483903f, 0.14233552f, 0.13832581f, 0.18350165f, 0.14545603f, + -0.028545704f,0.024939531f,0.050929718f,0.0076203286f,-0.0029723682f, + -0.042484224f, -0.11827596f, -0.09171104f, -0.10808628f,-0.16327988f, + -0.2273378f, -0.0993647f, -0.017155107f,0.0023917493f,0.049272764f, + 0.0038534778f, 0.054764505f, 0.089753784f, 0.06947234f, 0.08014476f, + -0.04544234f, -0.0497073f,-0.07135631f, -0.048929106f,-0.004042012f, + -0.009284026f, 0.018042054f, 0.0036860977f,-0.07427302f, -0.11434604f, + -0.018995456f, 0.031487543f, 0.012834908f,0.019977754f,0.044256654f, + -0.39292613f, -0.18519334f, -0.11651281f,-0.06809892f, 0.011373677f + }); + + auto inputToForgetWeights = + MakeTensor<float, 2>(tensorInfo20x5, {-0.0018401089f, -0.004852237f,0.03698424f, 0.014181704f,0.028273236f, + -0.016726194f, -0.05249759f,-0.10204261f, 0.00861066f,-0.040979505f, + -0.009899187f,0.01923892f,-0.028177269f, -0.08535103f,-0.14585495f, + 0.10662567f,-0.01909731f,-0.017883534f,-0.0047269356f,-0.045103323f, + 0.0030784295f,0.076784775f,0.07463696f, 0.094531395f,0.0814421f, + -0.12257899f, -0.033945758f,-0.031303465f, 0.045630626f,0.06843887f, + -0.13492945f, -0.012480007f,-0.0811829f, -0.07224499f,-0.09628791f, + 0.045100946f,0.0012300825f, 0.013964662f, 0.099372394f,0.02543059f, + 0.06958324f, 0.034257296f, 0.0482646f, 0.06267997f,0.052625068f, + 0.12784666f, 0.07077897f, 0.025725935f, 0.04165009f,0.07241905f, + 0.018668644f, -0.037377294f,-0.06277783f,-0.08833636f,-0.040120605f, + -0.011405586f,-0.007808335f,-0.010301386f,-0.005102167f,0.027717464f, + 0.05483423f, 0.11449111f, 0.11289652f,0.10939839f, 0.13396506f, + -0.08402166f,-0.01901462f, -0.044678304f,-0.07720565f,0.014350063f, + -0.11757958f, -0.0652038f, -0.08185733f,-0.076754324f,-0.092614375f, + 0.10405491f, 0.052960336f, 0.035755895f,0.035839386f,-0.012540553f, + 0.036881298f, 0.02913376f, 0.03420159f,0.05448447f,-0.054523353f, + 0.02582715f, 0.02327355f, -0.011857179f,-0.0011980024f,-0.034641717f, + 
-0.026125094f,-0.17582615f,-0.15923657f,-0.27486774f,-0.0006143371f, + 0.0001771948f, -8.470171e-05f, 0.02651807f,0.045790765f,0.06956496f + }); + + auto inputToCellWeights = + MakeTensor<float, 2>(tensorInfo20x5, {-0.04580283f, -0.09549462f, -0.032418985f, -0.06454633f, + -0.043528453f, 0.043018587f, -0.049152344f, -0.12418144f, + -0.078985475f, -0.07596889f, 0.019484362f, -0.11434962f, + -0.0074034138f, -0.06314844f, -0.092981495f, 0.0062155537f, + -0.025034338f, -0.0028890965f, 0.048929527f, 0.06235075f, + 0.10665918f, -0.032036792f, -0.08505916f, -0.10843358f, + -0.13002433f, -0.036816437f, -0.02130134f, -0.016518239f, + 0.0047691227f, -0.0025825808f, 0.066017866f, 0.029991534f, + -0.10652836f, -0.1037554f, -0.13056071f, -0.03266643f, + -0.033702414f, -0.006473424f, -0.04611692f, 0.014419339f, + -0.025174323f, 0.0396852f, 0.081777506f, 0.06157468f, + 0.10210095f, -0.009658194f, 0.046511717f, 0.03603906f, + 0.0069369148f, 0.015960095f, -0.06507666f, 0.09551598f, + 0.053568836f, 0.06408714f, 0.12835667f, -0.008714329f, + -0.20211966f, -0.12093674f, 0.029450472f, 0.2849013f, + -0.029227901f, 0.1164364f, -0.08560263f, 0.09941786f, + -0.036999565f, -0.028842626f, -0.0033637602f, -0.017012902f, + -0.09720865f, -0.11193351f, -0.029155117f, -0.017936034f, + -0.009768936f, -0.04223324f, -0.036159635f, 0.06505112f, + -0.021742892f, -0.023377212f, -0.07221364f, -0.06430552f, + 0.05453865f, 0.091149814f, 0.06387331f, 0.007518393f, + 0.055960953f, 0.069779344f, 0.046411168f, 0.10509911f, + 0.07463894f, 0.0075130584f, 0.012850982f, 0.04555431f, + 0.056955688f, 0.06555285f, 0.050801456f, -0.009862683f, + 0.00826772f, -0.026555609f, -0.0073611983f, -0.0014897042f + }); + + auto inputToOutputWeights = + MakeTensor<float, 2>(tensorInfo20x5, {-0.0998932f, -0.07201956f, -0.052803773f,-0.15629593f,-0.15001918f, + -0.07650751f,0.02359855f, -0.075155355f, -0.08037709f, -0.15093534f, + 0.029517552f, -0.04751393f, 0.010350531f,-0.02664851f, -0.016839722f, + -0.023121163f, 0.0077019283f, 0.012851257f, -0.05040649f,-0.0129761f, + -0.021737747f,-0.038305793f,-0.06870586f, -0.01481247f,-0.001285394f, + 0.10124236f, 0.083122835f, 0.053313006f,-0.062235646f,-0.075637154f, + -0.027833903f, 0.029774971f, 0.1130802f, 0.09218906f, 0.09506135f, + -0.086665764f,-0.037162706f,-0.038880914f,-0.035832845f,-0.014481564f, + -0.09825003f,-0.12048569f,-0.097665586f,-0.05287633f, -0.0964047f, + -0.11366429f, 0.035777505f, 0.13568819f, 0.052451383f,0.050649304f, + 0.05798951f, -0.021852335f,-0.099848844f,0.014740475f,-0.078897946f, + 0.04974699f, 0.014160473f, 0.06973932f, 0.04964942f, 0.033364646f, + 0.08190124f, 0.025535367f, 0.050893165f, 0.048514254f,0.06945813f, + -0.078907564f,-0.06707616f, -0.11844508f, -0.09986688f,-0.07509403f, + 0.06263226f, 0.14925587f, 0.20188436f, 0.12098451f,0.14639415f, + 0.0015017595f, -0.014267382f, -0.03417257f,0.012711468f,0.0028300495f, + -0.024758482f, -0.05098548f,-0.0821182f, 0.014225672f, 0.021544158f, + 0.08949725f, 0.07505268f, -0.0020780868f, 0.04908258f,0.06476295f, + -0.022907063f,0.027562456f,0.040185735f, 0.019567577f,-0.015598739f, + -0.049097303f, -0.017121866f, -0.083368234f,-0.02332002f,-0.0840956f + }); + + auto inputGateBias = + MakeTensor<float, 1>(tensorInfo20, {0.02234832f, 0.14757581f, 0.18176508f, 0.10380666f, 0.053110216f, + -0.06928846f, -0.13942584f, -0.11816189f, 0.19483899f, 0.03652339f, + -0.10250295f, 0.036714908f, -0.18426876f, 0.036065217f, 0.21810818f, + 0.02383196f, -0.043370757f, 0.08690144f, -0.04444982f, 0.00030581196f + }); + + auto forgetGateBias = + 
MakeTensor<float, 1>(tensorInfo20, {0.035185695f, -0.042891346f, -0.03032477f, 0.23027696f, + 0.11098921f, 0.15378423f, 0.09263801f, 0.09790885f, + 0.09508917f, 0.061199076f, 0.07665568f, -0.015443159f, + -0.03499149f, 0.046190713f, 0.08895977f, 0.10899629f, + 0.40694186f, 0.06030037f, 0.012413437f, -0.06108739f + }); + + auto cellBias = + MakeTensor<float, 1>(tensorInfo20, {-0.024379363f, 0.0055531194f, 0.23377132f, 0.033463873f, + -0.1483596f, -0.10639995f, -0.091433935f, 0.058573797f, + -0.06809782f, -0.07889636f, -0.043246906f, -0.09829136f, + -0.4279842f, 0.034901652f, 0.18797937f, 0.0075234566f, + 0.016178843f, 0.1749513f, 0.13975595f, 0.92058027f + }); + + auto outputGateBias = + MakeTensor<float, 1>(tensorInfo20, {0.046159424f, -0.0012809046f, 0.03563469f, 0.12648113f, 0.027195795f, + 0.35373217f, -0.018957434f, 0.008907322f, -0.0762701f, 0.12018895f, + 0.04216877f, 0.0022856654f, 0.040952638f, 0.3147856f, 0.08225149f, + -0.057416286f, -0.14995944f, -0.008040261f, 0.13208859f, 0.029760877f + }); + + auto recurrentToInputWeights = + MakeTensor<float, 2>(tensorInfo20x16, {-0.001374326f, -0.078856036f, 0.10672688f, 0.029162422f, + -0.11585556f, 0.02557986f, -0.13446963f, -0.035785314f, + -0.01244275f, 0.025961924f, -0.02337298f, -0.044228926f, + -0.055839065f, -0.046598054f, -0.010546039f, -0.06900766f, + 0.027239809f, 0.022582639f, -0.013296484f, -0.05459212f, + 0.08981f, -0.045407712f, 0.08682226f, -0.06867011f, + -0.14390695f, -0.02916037f, 0.000996957f, 0.091420636f, + 0.14283475f, -0.07390571f, -0.06402044f, 0.062524505f, + -0.093129106f, 0.04860203f, -0.08364217f, -0.08119002f, + 0.009352075f, 0.22920375f, 0.0016303885f, 0.11583097f, + -0.13732095f, 0.012405723f, -0.07551853f, 0.06343048f, + 0.12162708f, -0.031923793f, -0.014335606f, 0.01790974f, + -0.10650317f, -0.0724401f, 0.08554849f, -0.05727212f, + 0.06556731f, -0.042729504f, -0.043227166f, 0.011683251f, + -0.013082158f, -0.029302018f, -0.010899579f, -0.062036745f, + -0.022509435f, -0.00964907f, -0.01567329f, 0.04260106f, + -0.07787477f, -0.11576462f, 0.017356863f, 0.048673786f, + -0.017577527f, -0.05527947f, -0.082487635f, -0.040137455f, + -0.10820036f, -0.04666372f, 0.022746278f, -0.07851417f, + 0.01068115f, 0.032956902f, 0.022433773f, 0.0026891115f, + 0.08944216f, -0.0685835f, 0.010513544f, 0.07228705f, + 0.02032331f, -0.059686817f, -0.0005566496f, -0.086984694f, + 0.040414046f, -0.1380399f, 0.094208956f, -0.05722982f, + 0.012092817f, -0.04989123f, -0.086576f, -0.003399834f, + -0.04696032f, -0.045747425f, 0.10091314f, 0.048676282f, + -0.029037097f, 0.031399418f, -0.0040285117f, 0.047237843f, + 0.09504992f, 0.041799378f, -0.049185462f, -0.031518843f, + -0.10516937f, 0.026374253f, 0.10058866f, -0.0033195973f, + -0.041975245f, 0.0073591834f, 0.0033782164f, -0.004325073f, + -0.10167381f, 0.042500053f, -0.01447153f, 0.06464186f, + -0.017142897f, 0.03312627f, 0.009205989f, 0.024138335f, + -0.011337001f, 0.035530265f, -0.010912711f, 0.0706555f, + -0.005894094f, 0.051841937f, -0.1401738f, -0.02351249f, + 0.0365468f, 0.07590991f, 0.08838724f, 0.021681072f, + -0.10086113f, 0.019608743f, -0.06195883f, 0.077335775f, + 0.023646897f, -0.095322326f, 0.02233014f, 0.09756986f, + -0.048691444f, -0.009579111f, 0.07595467f, 0.11480546f, + -0.09801813f, 0.019894179f, 0.08502348f, 0.004032281f, + 0.037211012f, 0.068537936f, -0.048005626f, -0.091520436f, + -0.028379958f, -0.01556313f, 0.06554592f, -0.045599163f, + -0.01672207f, -0.020169014f, -0.011877351f, -0.20212261f, + 0.010889619f, 0.0047078193f, 0.038385306f, 0.08540671f, + 
-0.017140968f, -0.0035865551f, 0.016678626f, 0.005633034f, + 0.015963363f, 0.00871737f, 0.060130805f, 0.028611384f, + 0.10109069f, -0.015060172f, -0.07894427f, 0.06401885f, + 0.011584063f, -0.024466386f, 0.0047652307f, -0.09041358f, + 0.030737216f, -0.0046374933f, 0.14215417f, -0.11823516f, + 0.019899689f, 0.006106124f, -0.027092824f, 0.0786356f, + 0.05052217f, -0.058925f, -0.011402121f, -0.024987547f, + -0.0013661642f, -0.06832946f, -0.015667673f, -0.1083353f, + -0.00096863037f, -0.06988685f, -0.053350925f, -0.027275559f, + -0.033664223f, -0.07978348f, -0.025200296f, -0.017207067f, + -0.058403496f, -0.055697463f, 0.005798788f, 0.12965427f, + -0.062582195f, 0.0013350133f, -0.10482091f, 0.0379771f, + 0.072521195f, -0.0029455067f, -0.13797039f, -0.03628521f, + 0.013806405f, -0.017858358f, -0.01008298f, -0.07700066f, + -0.017081132f, 0.019358726f, 0.0027079724f, 0.004635139f, + 0.062634714f, -0.02338735f, -0.039547626f, -0.02050681f, + 0.03385117f, -0.083611414f, 0.002862572f, -0.09421313f, + 0.058618143f, -0.08598433f, 0.00972939f, 0.023867095f, + -0.053934585f, -0.023203006f, 0.07452513f, -0.048767887f, + -0.07314807f, -0.056307215f, -0.10433547f, -0.06440842f, + 0.04328182f, 0.04389765f, -0.020006588f, -0.09076438f, + -0.11652589f, -0.021705797f, 0.03345259f, -0.010329105f, + -0.025767034f, 0.013057034f, -0.07316461f, -0.10145612f, + 0.06358255f, 0.18531723f, 0.07759293f, 0.12006465f, + 0.1305557f, 0.058638252f, -0.03393652f, 0.09622831f, + -0.16253184f, -2.4580743e-06f, 0.079869635f, -0.070196845f, + -0.005644518f, 0.06857898f, -0.12598175f, -0.035084512f, + 0.03156317f, -0.12794146f, -0.031963028f, 0.04692781f, + 0.030070418f, 0.0071660685f, -0.095516115f, -0.004643372f, + 0.040170413f, -0.062104587f, -0.0037324072f, 0.0554317f, + 0.08184801f, -0.019164372f, 0.06791302f, 0.034257166f, + -0.10307039f, 0.021943003f, 0.046745934f, 0.0790918f, + -0.0265588f, -0.007824208f, 0.042546265f, -0.00977924f, + -0.0002440307f, -0.017384544f, -0.017990116f, 0.12252321f, + -0.014512694f, -0.08251313f, 0.08861942f, 0.13589665f, + 0.026351685f, 0.012641483f, 0.07466548f, 0.044301085f, + -0.045414884f, -0.051112458f, 0.03444247f, -0.08502782f, + -0.04106223f, -0.028126027f, 0.028473156f, 0.10467447f + }); + + auto recurrentToForgetWeights = + MakeTensor<float, 2>(tensorInfo20x16, {-0.057784554f, -0.026057621f, -0.068447545f, -0.022581743f, + 0.14811787f, 0.10826372f, 0.09471067f, 0.03987225f, + -0.0039523416f, 0.00030638507f, 0.053185795f, 0.10572994f, + 0.08414449f, -0.022036452f, -0.00066928595f, -0.09203576f, + 0.032950465f, -0.10985798f, -0.023809856f, 0.0021431844f, + -0.02196096f, -0.00326074f, 0.00058621005f, -0.074678116f, + -0.06193199f, 0.055729095f, 0.03736828f, 0.020123724f, + 0.061878487f, -0.04729229f, 0.034919553f, -0.07585433f, + -0.04421272f, -0.044019096f, 0.085488975f, 0.04058006f, + -0.06890133f, -0.030951202f, -0.024628663f, -0.07672815f, + 0.034293607f, 0.08556707f, -0.05293577f, -0.033561368f, + -0.04899627f, 0.0241671f, 0.015736353f, -0.095442444f, + -0.029564252f, 0.016493602f, -0.035026584f, 0.022337519f, + -0.026871363f, 0.004780428f, 0.0077918363f, -0.03601621f, + 0.016435321f, -0.03263031f, -0.09543275f, -0.047392778f, + 0.013454138f, 0.028934088f, 0.01685226f, -0.086110644f, + -0.046250615f, -0.01847454f, 0.047608484f, 0.07339695f, + 0.034546845f, -0.04881143f, 0.009128804f, -0.08802852f, + 0.03761666f, 0.008096139f, -0.014454086f, 0.014361001f, + -0.023502491f, -0.0011840804f, -0.07607001f, 0.001856849f, + -0.06509276f, -0.006021153f, -0.08570962f, -0.1451793f, + 
0.060212336f, 0.055259194f, 0.06974018f, 0.049454916f, + -0.027794661f, -0.08077226f, -0.016179763f, 0.1169753f, + 0.17213494f, -0.0056326236f, -0.053934924f, -0.0124349f, + -0.11520337f, 0.05409887f, 0.088759385f, 0.0019655675f, + 0.0042065294f, 0.03881498f, 0.019844765f, 0.041858196f, + -0.05695512f, 0.047233116f, 0.038937137f, -0.06542224f, + 0.014429736f, -0.09719407f, 0.13908425f, -0.05379757f, + 0.012321099f, 0.082840554f, -0.029899208f, 0.044217527f, + 0.059855383f, 0.07711018f, -0.045319796f, 0.0948846f, + -0.011724666f, -0.0033288454f, -0.033542685f, -0.04764985f, + -0.13873616f, 0.040668588f, 0.034832682f, -0.015319203f, + -0.018715994f, 0.046002675f, 0.0599172f, -0.043107376f, + 0.0294216f, -0.002314414f, -0.022424703f, 0.0030315618f, + 0.0014641669f, 0.0029166266f, -0.11878115f, 0.013738511f, + 0.12375372f, -0.0006038222f, 0.029104086f, 0.087442465f, + 0.052958444f, 0.07558703f, 0.04817258f, 0.044462286f, + -0.015213451f, -0.08783778f, -0.0561384f, -0.003008196f, + 0.047060397f, -0.002058388f, 0.03429439f, -0.018839769f, + 0.024734668f, 0.024614193f, -0.042046934f, 0.09597743f, + -0.0043254104f, 0.04320769f, 0.0064070094f, -0.0019131786f, + -0.02558259f, -0.022822596f, -0.023273505f, -0.02464396f, + -0.10991725f, -0.006240552f, 0.0074488563f, 0.024044557f, + 0.04383914f, -0.046476185f, 0.028658995f, 0.060410924f, + 0.050786525f, 0.009452605f, -0.0073054377f, -0.024810238f, + 0.0052906186f, 0.0066939713f, -0.0020913032f, 0.014515517f, + 0.015898481f, 0.021362653f, -0.030262267f, 0.016587038f, + -0.011442813f, 0.041154444f, -0.007631438f, -0.03423484f, + -0.010977775f, 0.036152758f, 0.0066366293f, 0.11915515f, + 0.02318443f, -0.041350313f, 0.021485701f, -0.10906167f, + -0.028218046f, -0.00954771f, 0.020531068f, -0.11995105f, + -0.03672871f, 0.024019798f, 0.014255957f, -0.05221243f, + -0.00661567f, -0.04630967f, 0.033188973f, 0.10107534f, + -0.014027541f, 0.030796422f, -0.10270911f, -0.035999842f, + 0.15443139f, 0.07684145f, 0.036571592f, -0.035900835f, + -0.0034699554f, 0.06209149f, 0.015920248f, -0.031122351f, + -0.03858649f, 0.01849943f, 0.13872518f, 0.01503974f, + 0.069941424f, -0.06948533f, -0.0088794185f, 0.061282158f, + -0.047401894f, 0.03100163f, -0.041533746f, -0.10430945f, + 0.044574402f, -0.01425562f, -0.024290353f, 0.034563623f, + 0.05866852f, 0.023947537f, -0.09445152f, 0.035450947f, + 0.02247216f, -0.0042998926f, 0.061146557f, -0.10250651f, + 0.020881841f, -0.06747029f, 0.10062043f, -0.0023941975f, + 0.03532124f, -0.016341697f, 0.09685456f, -0.016764693f, + 0.051808182f, 0.05875331f, -0.04536488f, 0.001626336f, + -0.028892258f, -0.01048663f, -0.009793449f, -0.017093895f, + 0.010987891f, 0.02357273f, -0.00010856845f, 0.0099760275f, + -0.001845119f, -0.03551521f, 0.0018358806f, 0.05763657f, + -0.01769146f, 0.040995963f, 0.02235177f, -0.060430344f, + 0.11475477f, -0.023854522f, 0.10071741f, 0.0686208f, + -0.014250481f, 0.034261297f, 0.047418304f, 0.08562733f, + -0.030519066f, 0.0060542435f, 0.014653856f, -0.038836084f, + 0.04096551f, 0.032249358f, -0.08355519f, -0.026823482f, + 0.056386515f, -0.010401743f, -0.028396193f, 0.08507674f, + 0.014410365f, 0.020995233f, 0.17040324f, 0.11511526f, + 0.02459721f, 0.0066619175f, 0.025853224f, -0.023133837f, + -0.081302024f, 0.017264642f, -0.009585969f, 0.09491168f, + -0.051313367f, 0.054532815f, -0.014298593f, 0.10657464f, + 0.007076659f, 0.10964551f, 0.0409152f, 0.008275321f, + -0.07283536f, 0.07937492f, 0.04192024f, -0.1075027f + }); + + auto recurrentToCellWeights = + MakeTensor<float, 2>(tensorInfo20x16, {-0.037322544f, 
0.018592842f, 0.0056175636f, -0.06253426f, + 0.055647098f, -0.05713207f, -0.05626563f, 0.005559383f, + 0.03375411f, -0.025757805f, -0.088049285f, 0.06017052f, + -0.06570978f, 0.007384076f, 0.035123326f, -0.07920549f, + 0.053676967f, 0.044480428f, -0.07663568f, 0.0071805613f, + 0.08089997f, 0.05143358f, 0.038261272f, 0.03339287f, + -0.027673481f, 0.044746667f, 0.028349208f, 0.020090483f, + -0.019443132f, -0.030755889f, -0.0040000007f, 0.04465846f, + -0.021585021f, 0.0031670958f, 0.0053199246f, -0.056117613f, + -0.10893326f, 0.076739706f, -0.08509834f, -0.027997585f, + 0.037871376f, 0.01449768f, -0.09002357f, -0.06111149f, + -0.046195522f, 0.0422062f, -0.005683705f, -0.1253618f, + -0.012925729f, -0.04890792f, 0.06985068f, 0.037654128f, + 0.03398274f, -0.004781977f, 0.007032333f, -0.031787455f, + 0.010868644f, -0.031489216f, 0.09525667f, 0.013939797f, + 0.0058680447f, 0.0167067f, 0.02668468f, -0.04797466f, + -0.048885044f, -0.12722108f, 0.035304096f, 0.06554885f, + 0.00972396f, -0.039238118f, -0.05159735f, -0.11329045f, + 0.1613692f, -0.03750952f, 0.06529313f, -0.071974665f, + -0.11769596f, 0.015524369f, -0.0013754242f, -0.12446318f, + 0.02786344f, -0.014179351f, 0.005264273f, 0.14376344f, + 0.015983658f, 0.03406988f, -0.06939408f, 0.040699873f, + 0.02111075f, 0.09669095f, 0.041345075f, -0.08316494f, + -0.07684199f, -0.045768797f, 0.032298047f, -0.041805092f, + 0.0119405f, 0.0061010392f, 0.12652606f, 0.0064572375f, + -0.024950314f, 0.11574242f, 0.04508852f, -0.04335324f, + 0.06760663f, -0.027437469f, 0.07216407f, 0.06977076f, + -0.05438599f, 0.034033038f, -0.028602652f, 0.05346137f, + 0.043184172f, -0.037189785f, 0.10420091f, 0.00882477f, + -0.054019816f, -0.074273005f, -0.030617684f, -0.0028467078f, + 0.024302477f, -0.0038869337f, 0.005332455f, 0.0013399826f, + 0.04361412f, -0.007001822f, 0.09631092f, -0.06702025f, + -0.042049985f, -0.035070654f, -0.04103342f, -0.10273396f, + 0.0544271f, 0.037184782f, -0.13150354f, -0.0058036847f, + -0.008264958f, 0.042035464f, 0.05891794f, 0.029673764f, + 0.0063542654f, 0.044788733f, 0.054816857f, 0.062257513f, + -0.00093483756f, 0.048938446f, -0.004952862f, -0.007730018f, + -0.04043371f, -0.017094059f, 0.07229206f, -0.023670016f, + -0.052195564f, -0.025616996f, -0.01520939f, 0.045104615f, + -0.007376126f, 0.003533447f, 0.006570588f, 0.056037236f, + 0.12436656f, 0.051817212f, 0.028532185f, -0.08686856f, + 0.11868599f, 0.07663395f, -0.07323171f, 0.03463402f, + -0.050708205f, -0.04458982f, -0.11590894f, 0.021273347f, + 0.1251325f, -0.15313013f, -0.12224372f, 0.17228661f, + 0.023029093f, 0.086124025f, 0.006445803f, -0.03496501f, + 0.028332196f, 0.04449512f, -0.042436164f, -0.026587414f, + -0.006041347f, -0.09292539f, -0.05678812f, 0.03897832f, + 0.09465633f, 0.008115513f, -0.02171956f, 0.08304309f, + 0.071401566f, 0.019622514f, 0.032163795f, -0.004167056f, + 0.02295182f, 0.030739572f, 0.056506045f, 0.004612461f, + 0.06524936f, 0.059999723f, 0.046395954f, -0.0045512207f, + -0.1335546f, -0.030136576f, 0.11584653f, -0.014678886f, + 0.0020118146f, -0.09688814f, -0.0790206f, 0.039770417f, + -0.0329582f, 0.07922767f, 0.029322514f, 0.026405897f, + 0.04207835f, -0.07073373f, 0.063781224f, 0.0859677f, + -0.10925287f, -0.07011058f, 0.048005477f, 0.03438226f, + -0.09606514f, -0.006669445f, -0.043381985f, 0.04240257f, + -0.06955775f, -0.06769346f, 0.043903265f, -0.026784198f, + -0.017840602f, 0.024307009f, -0.040079936f, -0.019946516f, + 0.045318738f, -0.12233574f, 0.026170589f, 0.0074471775f, + 0.15978073f, 0.10185836f, 0.10298046f, -0.015476589f, + -0.039390966f, 
-0.072174534f, 0.0739445f, -0.1211869f, + -0.0347889f, -0.07943156f, 0.014809798f, -0.12412325f, + -0.0030663363f, 0.039695457f, 0.0647603f, -0.08291318f, + -0.018529687f, -0.004423833f, 0.0037507233f, 0.084633216f, + -0.01514876f, -0.056505352f, -0.012800942f, -0.06994386f, + 0.012962922f, -0.031234352f, 0.07029052f, 0.016418684f, + 0.03618972f, 0.055686004f, -0.08663945f, -0.017404709f, + -0.054761406f, 0.029065743f, 0.052404847f, 0.020238016f, + 0.0048197987f, -0.0214882f, 0.07078733f, 0.013016777f, + 0.06262858f, 0.009184685f, 0.020785125f, -0.043904778f, + -0.0270329f, -0.03299152f, -0.060088247f, -0.015162964f, + -0.001828936f, 0.12642565f, -0.056757294f, 0.013586685f, + 0.09232601f, -0.035886683f, 0.06000002f, 0.05229691f, + -0.052580316f, -0.082029596f, -0.010794592f, 0.012947712f, + -0.036429964f, -0.085508935f, -0.13127148f, -0.017744139f, + 0.031502828f, 0.036232427f, -0.031581745f, 0.023051167f, + -0.05325106f, -0.03421577f, 0.028793324f, -0.034633752f, + -0.009881397f, -0.043551125f, -0.018609839f, 0.0019097115f, + -0.008799762f, 0.056595087f, 0.0022273948f, 0.055752404f + }); + + auto recurrentToOutputWeights = + MakeTensor<float, 2>(tensorInfo20x16, {0.025825322f, -0.05813119f, 0.09495884f,-0.045984812f, -0.01255415f, + -0.0026479573f,-0.08196161f,-0.054914974f,-0.0046604523f, + -0.029587349f, -0.044576716f, -0.07480124f, -0.082868785f, + 0.023254942f, 0.027502948f, -0.0039728214f, -0.08683098f, + -0.08116779f, -0.014675607f, -0.037924774f, -0.023314456f, + -0.007401714f, -0.09255757f, 0.029460307f, -0.08829125f, + -0.005139627f, -0.08989442f, -0.0555066f, 0.13596267f, + -0.025062224f, -0.048351806f, -0.03850004f, 0.07266485f, + -0.022414139f, 0.05940088f, 0.075114764f, 0.09597592f, + -0.010211725f, -0.0049794707f, -0.011523867f, -0.025980417f, + 0.072999895f, 0.11091378f, -0.081685916f, 0.014416728f, + 0.043229222f, 0.034178585f, -0.07530371f, 0.035837382f, + -0.085607f, -0.007721233f, -0.03287832f, -0.043848954f, + -0.06404588f, -0.06632928f, -0.073643476f, 0.008214239f, + -0.045984086f, 0.039764922f, 0.03474462f, 0.060612556f, + -0.080590084f, 0.049127717f, 0.04151091f, -0.030063879f, + 0.008801774f, -0.023021035f, -0.019558564f, 0.05158114f, + -0.010947698f, -0.011825728f, 0.0075720972f, 0.0699727f, + -0.0039981045f, 0.069350146f, 0.08799282f, 0.016156472f, + 0.035502106f, 0.11695009f, 0.006217345f, 0.13392477f, + -0.037875112f, 0.025745004f, 0.08940699f, -0.00924166f, + 0.0046702605f, -0.036598757f, -0.08811812f, 0.10522024f, + -0.032441203f, 0.008176899f, -0.04454919f, 0.07058152f, + 0.0067963637f, 0.039206743f, 0.03259838f, 0.03725492f, + -0.09515802f, 0.013326398f, -0.052055415f, -0.025676316f, + 0.03198509f, -0.015951829f, -0.058556724f, 0.036879618f, + 0.043357447f, 0.028362012f, -0.05908629f, 0.0059240665f, + -0.04995891f, -0.019187413f,0.0276265f, -0.01628143f, 0.0025863599f, + 0.08800015f, 0.035250366f, -0.022165963f, -0.07328642f, + -0.009415526f, -0.07455109f, 0.11690406f, 0.0363299f, + 0.07411125f, 0.042103454f, -0.009660886f, 0.019076364f, + 0.018299393f, -0.046004917f, 0.08891175f,0.0431396f, -0.026327137f, + -0.051502608f, 0.08979574f, -0.051670972f, 0.04940282f, + -0.07491107f, -0.021240504f, 0.022596184f, -0.034280192f, + 0.060163025f, -0.058211457f, -0.051837247f, -0.01349775f, + -0.04639988f, -0.035936575f, -0.011681591f, 0.064818054f, + 0.0073146066f, -0.021745546f, -0.043124277f, -0.06471268f, + -0.07053354f, -0.029321948f, -0.05330136f, 0.016933719f, + -0.053782392f, 0.13747959f, -0.1361751f, -0.11569455f, + 0.0033329215f, 0.05693899f, 
-0.053219706f, 0.063698f, + 0.07977434f, -0.07924483f, 0.06936997f, 0.0034815092f, + -0.007305279f, -0.037325785f, -0.07251102f, -0.033633437f, + -0.08677009f, 0.091591336f, -0.14165086f, 0.021752775f, + 0.019683983f, 0.0011612234f, -0.058154266f, 0.049996935f, + 0.0288841f, -0.0024567875f, -0.14345716f, 0.010955264f,-0.10234828f, + 0.1183656f, -0.0010731248f, -0.023590032f,-0.072285876f,-0.0724771f, + -0.026382286f, -0.0014920527f, 0.042667855f, 0.0018776858f, + 0.02986552f, 0.009814309f, 0.0733756f, 0.12289186f, + 0.018043943f, -0.0458958f, 0.049412545f, 0.033632483f, + 0.05495232f, 0.036686596f, -0.013781798f, -0.010036754f, + 0.02576849f, -0.08307328f, 0.010112348f, 0.042521734f, + -0.05869831f, -0.071689695f, 0.03876447f, -0.13275425f, -0.0352966f, + -0.023077697f, 0.10285965f, 0.084736146f, 0.15568255f, + -0.00040734606f, 0.027835453f, -0.10292561f, -0.032401145f, + 0.10053256f, -0.026142767f, -0.08271222f, -0.0030240538f, + -0.016368777f, 0.1070414f, 0.042672627f, 0.013456989f, + -0.0437609f, -0.022309763f, 0.11576483f, 0.04108048f, + 0.061026827f, -0.0190714f, -0.0869359f, 0.037901703f, 0.0610107f, + 0.07202949f, 0.01675338f, 0.086139716f, -0.08795751f, + -0.014898893f, -0.023771819f, -0.01965048f, 0.007955471f, + -0.043740474f, 0.03346837f, -0.10549954f, 0.090567775f, + 0.042013682f, -0.03176985f, 0.12569028f, -0.02421228f, + -0.029526481f, 0.023851605f, 0.031539805f, 0.05292009f, + -0.02344001f, -0.07811758f, -0.08834428f, 0.10094801f, + 0.16594367f, -0.06861939f, -0.021256343f, -0.041093912f, + -0.06669611f, 0.035498552f, 0.021757556f, -0.09302526f, + -0.015403468f, -0.06614931f, -0.051798206f, -0.013874718f, + 0.03630673f, 0.010412845f, -0.08077351f, 0.046185967f, + 0.0035662893f, 0.03541868f, -0.094149634f, -0.034814864f, + 0.003128424f, -0.020674974f, -0.03944324f, -0.008110165f, + -0.11113267f, 0.08484226f, 0.043586485f, 0.040582247f, + 0.0968012f, -0.065249965f, -0.028036479f, 0.0050708856f, + 0.0017462453f, 0.0326779f, 0.041296225f, 0.09164146f, + -0.047743853f, -0.015952192f, -0.034451712f, 0.084197424f, + -0.05347844f, -0.11768019f, 0.085926116f, -0.08251791f, + -0.045081906f, 0.0948852f, 0.068401024f, 0.024856757f, + 0.06978981f, -0.057309967f, -0.012775832f, -0.0032452994f, + 0.01977615f, -0.041040014f, -0.024264973f,0.063464895f, 0.05431621f + }); + + auto cellToInputWeights = + MakeTensor<float, 1>(tensorInfo20, {0.040369894f, 0.030746894f, 0.24704495f, 0.018586371f, -0.037586458f, + -0.15312155f, -0.11812848f, -0.11465643f, 0.20259799f, 0.11418174f, + -0.10116027f, -0.011334949f, 0.12411352f, -0.076769054f,-0.052169047f, + 0.21198851f, -0.38871562f, -0.09061183f, -0.09683246f, -0.21929175f + }); + + + auto cellToForgetWeights = + MakeTensor<float, 1>(tensorInfo20, {-0.01998659f,-0.15568835f,-0.24248174f, -0.012770197f, 0.041331276f, + -0.072311886f, -0.052123554f,-0.0066330447f,-0.043891653f,0.036225766f, + -0.047248036f, 0.021479502f,0.033189066f, 0.11952997f, -0.020432774f, + 0.64658105f, -0.06650122f, -0.03467612f, 0.095340036f, 0.23647355f + }); + + auto cellToOutputWeights = + MakeTensor<float, 1>(tensorInfo20, {0.08286371f, -0.08261836f, -0.51210177f, 0.002913762f, 0.17764764f, + -0.5495371f, -0.08460716f, -0.24552552f, 0.030037103f, 0.04123544f, + -0.11940523f, 0.007358328f, 0.1890978f, 0.4833202f, -0.34441817f, + 0.36312827f, -0.26375428f, 0.1457655f, -0.19724406f, 0.15548733f + }); + + auto projectionWeights = + MakeTensor<float, 2>(tensorInfo16x20, + {-0.009802181f, 0.09401916f, 0.0717386f, -0.13895074f, 0.09641832f, + 0.060420845f, 0.08539281f, 
0.054285463f, 0.061395317f, 0.034448683f, + -0.042991187f, 0.019801661f, -0.16840284f, -0.015726732f, -0.23041931f, + -0.024478018f, -0.10959692f, -0.013875541f, 0.18600968f, -0.061274476f, + 0.0138165f, -0.08160894f, -0.07661644f, 0.032372914f, 0.16169067f, + 0.22465782f, -0.03993472f, -0.004017731f, 0.08633481f, -0.28869787f, + 0.08682067f, 0.17240396f, 0.014975425f, 0.056431185f, 0.031037588f, + 0.16702051f, 0.0077946745f, 0.15140012f, 0.29405436f, 0.120285f, + -0.188994f, -0.027265169f, 0.043389652f, -0.022061434f, 0.014777949f, + -0.20203483f, 0.094781205f, 0.19100232f, 0.13987629f, -0.036132768f, + -0.06426278f, -0.05108664f, 0.13221376f, 0.009441198f, -0.16715929f, + 0.15859416f, -0.040437475f, 0.050779544f, -0.022187516f, 0.012166504f, + 0.027685808f, -0.07675938f, -0.0055694645f, -0.09444123f, 0.0046453946f, + 0.050794356f, 0.10770313f, -0.20790008f, -0.07149004f, -0.11425117f, + 0.008225835f, -0.035802525f, 0.14374903f, 0.15262283f, 0.048710253f, + 0.1847461f, -0.007487823f, 0.11000021f, -0.09542012f, 0.22619456f, + -0.029149994f, 0.08527916f, 0.009043713f, 0.0042746216f, 0.016261552f, + 0.022461696f, 0.12689082f, -0.043589946f, -0.12035478f, -0.08361797f, + -0.050666027f, -0.1248618f, -0.1275799f, -0.071875185f, 0.07377272f, + 0.09944291f, -0.18897448f, -0.1593054f, -0.06526116f, -0.040107165f, + -0.004618631f, -0.067624845f, -0.007576253f, 0.10727444f, 0.041546922f, + -0.20424393f, 0.06907816f, 0.050412357f, 0.00724631f, 0.039827548f, + 0.12449835f, 0.10747581f, 0.13708383f, 0.09134148f, -0.12617786f, + -0.06428341f, 0.09956831f, 0.1208086f, -0.14676677f, -0.0727722f, + 0.1126304f, 0.010139365f, 0.015571211f, -0.038128063f, 0.022913318f, + -0.042050496f, 0.16842307f, -0.060597885f, 0.10531834f, -0.06411776f, + -0.07451711f, -0.03410368f, -0.13393489f, 0.06534304f, 0.003620307f, + 0.04490757f, 0.05970546f, 0.05197996f, 0.02839995f, 0.10434969f, + -0.013699693f, -0.028353551f, -0.07260381f, 0.047201227f, -0.024575593f, + -0.036445823f, 0.07155557f, 0.009672501f, -0.02328883f, 0.009533515f, + -0.03606021f, -0.07421458f, -0.028082801f, -0.2678904f, -0.13221288f, + 0.18419984f, -0.13012612f, -0.014588381f, -0.035059117f, -0.04824723f, + 0.07830115f, -0.056184657f, 0.03277091f, 0.025466874f, 0.14494097f, + -0.12522776f, -0.098633975f, -0.10766018f, -0.08317623f, 0.08594209f, + 0.07749552f, 0.039474737f, 0.1776665f, -0.07409566f, -0.0477268f, + 0.29323658f, 0.10801441f, 0.1154011f, 0.013952499f, 0.10739139f, + 0.10708251f, -0.051456142f, 0.0074137426f, -0.10430189f, 0.10034707f, + 0.045594677f, 0.0635285f, -0.0715442f, -0.089667566f, -0.10811871f, + 0.00026344223f, 0.08298446f, -0.009525053f, 0.006585689f, -0.24567553f, + -0.09450807f, 0.09648481f, 0.026996298f, -0.06419476f, -0.04752702f, + -0.11063944f, -0.23441927f, -0.17608605f, -0.052156363f, 0.067035615f, + 0.19271925f, -0.0032889997f, -0.043264326f, 0.09663576f, -0.057112187f, + -0.10100678f, 0.0628376f, 0.04447668f, 0.017961001f, -0.10094388f, + -0.10190601f, 0.18335468f, 0.10494553f, -0.052095775f, -0.0026118709f, + 0.10539724f, -0.04383912f, -0.042349473f, 0.08438151f, -0.1947263f, + 0.02251204f, 0.11216432f, -0.10307853f, 0.17351969f, -0.039091777f, + 0.08066188f, -0.00561982f, 0.12633002f, 0.11335965f, -0.0088127935f, + -0.019777594f, 0.06864014f, -0.059751723f, 0.016233567f, -0.06894641f, + -0.28651384f, -0.004228674f, 0.019708522f, -0.16305895f, -0.07468996f, + -0.0855457f, 0.099339016f, -0.07580735f, -0.13775392f, 0.08434318f, + 0.08330512f, -0.12131499f, 0.031935584f, 0.09180414f, -0.08876437f, + -0.08049874f, 
0.008753825f, 0.03498998f, 0.030215185f, 0.03907079f, + 0.089751154f, 0.029194152f, -0.03337423f, -0.019092513f, 0.04331237f, + 0.04299654f, -0.036394123f, -0.12915532f, 0.09793732f, 0.07512415f, + -0.11319543f, -0.032502122f, 0.15661901f, 0.07671967f, -0.005491124f, + -0.19379048f, -0.218606f, 0.21448623f, 0.017840758f, 0.1416943f, + -0.07051762f, 0.19488361f, 0.02664691f, -0.18104725f, -0.09334311f, + 0.15026465f, -0.15493552f, -0.057762887f, -0.11604192f, -0.262013f, + -0.01391798f, 0.012185008f, 0.11156489f, -0.07483202f, 0.06693364f, + -0.26151478f, 0.046425626f, 0.036540434f, -0.16435726f, 0.17338543f, + -0.21401681f, -0.11385144f, -0.08283257f, -0.069031075f, 0.030635102f, + 0.010969227f, 0.11109743f, 0.010919218f, 0.027526086f, 0.13519906f, + 0.01891392f, -0.046839405f, -0.040167913f, 0.017953383f, -0.09700955f, + 0.0061885654f, -0.07000971f, 0.026893595f, -0.038844477f, 0.14543656f + }); + + std::vector<float> projectionBiasVector(outputSize, 0.f); + auto projectionBias = MakeTensor<float,1>(tensorInfo16, projectionBiasVector); + + armnn::ScopedCpuTensorHandle inputToInputWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfo20x5); + armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle recurrentToInputWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfo20x16); + armnn::ScopedCpuTensorHandle cellToInputWeightsTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle inputGateBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle cellBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfo20); + armnn::ScopedCpuTensorHandle projectionWeightsTensor(tensorInfo16x20); + armnn::ScopedCpuTensorHandle projectionBiasTensor(tensorInfo16); + + AllocateAndCopyDataToITensorHandle(&inputToInputWeightsTensor, &inputToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToInputWeightsTensor, &recurrentToInputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&cellToInputWeightsTensor, &cellToInputWeights[0]); + AllocateAndCopyDataToITensorHandle(&inputGateBiasTensor, &inputGateBias[0]); + AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]); + AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]); + AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]); + AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, 
&cellToForgetWeights[0]); + AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]); + AllocateAndCopyDataToITensorHandle(&projectionWeightsTensor, &projectionWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&projectionBiasTensor, &projectionBias[0]); + + data.m_InputToInputWeights = &inputToInputWeightsTensor; + data.m_InputToForgetWeights = &inputToForgetWeightsTensor; + data.m_InputToCellWeights = &inputToCellWeightsTensor; + data.m_InputToOutputWeights = &inputToOutputWeightsTensor; + data.m_RecurrentToInputWeights = &recurrentToInputWeightsTensor; + data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor; + data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor; + data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor; + data.m_CellToInputWeights = &cellToInputWeightsTensor; + data.m_InputGateBias = &inputGateBiasTensor; + data.m_ForgetGateBias = &forgetGateBiasTensor; + data.m_CellBias = &cellBiasTensor; + data.m_OutputGateBias = &outputGateBiasTensor; + data.m_CellToForgetWeights = &cellToForgetWeightsTensor; + data.m_CellToOutputWeights = &cellToOutputWeightsTensor; + data.m_ProjectionWeights = &projectionWeightsTensor; + data.m_ProjectionBias = &projectionBiasTensor; + + // Flags to set test configuration + data.m_Parameters.m_ActivationFunc = 4; + data.m_Parameters.m_CifgEnabled = false; + data.m_Parameters.m_PeepholeEnabled = true; + data.m_Parameters.m_ProjectionEnabled = true; + + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info); + inputHandle->Allocate(); + outputStateInHandle->Allocate(); + cellStateInHandle->Allocate(); + + scratchHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]); + CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]); + CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]); + + workloadFactory.Finalize(); + workload->Execute(); + + CopyDataFromITensorHandle(&ret.output[0][0], outputHandle.get()); + + return ret; + +} + + +LayerTestResult<float, 2> LstmLayerWithCifgWithPeepholeNoProjectionTestImpl(armnn::IWorkloadFactory& workloadFactory, + const boost::multi_array<float, 2>& input, + const boost::multi_array<float, 2>& outputExpected) +{ + bool cifgEnabled = true; + bool peepholeEnabled = true; + bool projectionEnabled = false; + // These are not the input and the output of Lstm yet + unsigned int batchSize = boost::numeric_cast<unsigned int>(input.shape()[0]); + unsigned int inputSize = boost::numeric_cast<unsigned int>(input.shape()[1]); + + unsigned int outputSize = boost::numeric_cast<unsigned int>(outputExpected.shape()[1]); + + const unsigned int cellSize = outputSize; + + // Decide the shape of all input tensors + armnn::TensorInfo inputTensorInfo({batchSize , inputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo outputStateInTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo cellStateInTensorInfo({batchSize, cellSize}, armnn::GetDataType<float>()); + + unsigned int scratchBufferSize = cifgEnabled ? 
cellSize * 4 : cellSize * 3; + armnn::TensorInfo scratchBufferTensorInfo({batchSize, scratchBufferSize}, armnn::GetDataType<float>()); + armnn::TensorInfo outputStateOutTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo cellStateOutTensorInfo({batchSize, cellSize}, armnn::GetDataType<float>()); + armnn::TensorInfo outputTensorInfo({batchSize, outputSize}, armnn::GetDataType<float>()); + + // List of inputs + std::vector<float> inputData; + inputData.assign(input.data(), input.data() + batchSize*inputSize); + auto inputTensor = MakeTensor<float,2>(inputTensorInfo, inputData); + + std::vector<float> outputStateInVector(batchSize * outputSize, 0.f); + auto outputStateInTensor = MakeTensor<float, 2>(outputStateInTensorInfo, outputStateInVector); + + std::vector<float> cellStateInVector(batchSize * cellSize, 0.f); + auto cellStateInTensor = MakeTensor<float, 2>(cellStateInTensorInfo, cellStateInVector); + + + // Prepare all the weights in the descriptor for LSTM + armnn::LstmQueueDescriptor data; + armnn::TensorInfo tensorInfoInput({cellSize, inputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfoOutput({cellSize, outputSize}, armnn::GetDataType<float>()); + armnn::TensorInfo tensorInfoNumUnits({cellSize}, armnn::GetDataType<float>()); + + auto inputToCellWeights = MakeTensor<float, 2>(tensorInfoInput, + {-0.49770179f, -0.27711356f, -0.09624726f, 0.05100781f, + 0.04717243f, 0.48944736f, -0.38535351f, + -0.17212132f}); + auto inputToForgetWeights = MakeTensor<float, 2>(tensorInfoInput, + {-0.55291498f, -0.42866567f, 0.13056988f, + -0.3633365f, -0.22755712f, 0.28253698f, 0.24407166f, + 0.33826375f}); + auto inputToOutputWeights = MakeTensor<float, 2>(tensorInfoInput, + {0.10725588f, -0.02335852f, -0.55932593f, + -0.09426838f, -0.44257352f, 0.54939759f, + 0.01533556f, 0.42751634f}); + auto cellBias = MakeTensor<float, 1>(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f}); + auto forgetGateBias = MakeTensor<float, 1>(tensorInfoNumUnits, {1.f, 1.f, 1.f, 1.f}); + auto outputGateBias = MakeTensor<float, 1>(tensorInfoNumUnits, {0.f, 0.f, 0.f, 0.f}); + + auto recurrentToCellWeights = MakeTensor<float, 2>(tensorInfoOutput, + {0.54066205f, -0.32668582f, -0.43562764f, -0.56094903f, 0.42957711f, + 0.01841056f, -0.32764608f, -0.33027974f, -0.10826075f, 0.20675004f, + 0.19069612f, -0.03026325f, -0.54532051f, 0.33003211f, 0.44901288f, + 0.21193194f}); + auto recurrentToForgetWeights = MakeTensor<float, 2>(tensorInfoOutput, + {-0.13832897f, -0.0515101f, -0.2359007f, -0.16661474f, -0.14340827f, + 0.36986142f, 0.23414481f, 0.55899f, 0.10798943f, -0.41174671f, 0.17751795f, + -0.34484994f, -0.35874045f, -0.11352962f, 0.27268326f, 0.54058349f}); + + auto recurrentToOutputWeights = MakeTensor<float, 2>(tensorInfoOutput, + {0.41613156f, 0.42610586f, -0.16495961f, -0.5663873f, 0.30579174f, -0.05115908f, + -0.33941799f, 0.23364776f, 0.11178309f, 0.09481031f, -0.26424935f, 0.46261835f, + 0.50248802f, 0.26114327f, -0.43736315f, 0.33149987f}); + + auto cellToForgetWeights = MakeTensor<float, 1>(tensorInfoNumUnits, + {0.47485286f, -0.51955009f, -0.24458408f, 0.31544167f}); + auto cellToOutputWeights = MakeTensor<float, 1>(tensorInfoNumUnits, + {-0.17135078f, 0.82760304f, 0.85573703f, -0.77109635f}); + + armnn::ScopedCpuTensorHandle inputToCellWeightsTensor(tensorInfoInput); + armnn::ScopedCpuTensorHandle inputToForgetWeightsTensor(tensorInfoInput); + armnn::ScopedCpuTensorHandle inputToOutputWeightsTensor(tensorInfoInput); + + armnn::ScopedCpuTensorHandle 
cellBiasTensor(tensorInfoNumUnits); + armnn::ScopedCpuTensorHandle forgetGateBiasTensor(tensorInfoNumUnits); + armnn::ScopedCpuTensorHandle outputGateBiasTensor(tensorInfoNumUnits); + + armnn::ScopedCpuTensorHandle recurrentToCellWeightsTensor(tensorInfoOutput); + armnn::ScopedCpuTensorHandle recurrentToForgetWeightsTensor(tensorInfoOutput); + armnn::ScopedCpuTensorHandle recurrentToOutputWeightsTensor(tensorInfoOutput); + + + armnn::ScopedCpuTensorHandle cellToForgetWeightsTensor(tensorInfoNumUnits); + armnn::ScopedCpuTensorHandle cellToOutputWeightsTensor(tensorInfoNumUnits); + + AllocateAndCopyDataToITensorHandle(&inputToCellWeightsTensor, &inputToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToForgetWeightsTensor, &inputToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&inputToOutputWeightsTensor, &inputToOutputWeights[0][0]); + + AllocateAndCopyDataToITensorHandle(&cellBiasTensor, &cellBias[0]); + AllocateAndCopyDataToITensorHandle(&forgetGateBiasTensor, &forgetGateBias[0]); + AllocateAndCopyDataToITensorHandle(&outputGateBiasTensor, &outputGateBias[0]); + + AllocateAndCopyDataToITensorHandle(&recurrentToCellWeightsTensor, &recurrentToCellWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToForgetWeightsTensor, &recurrentToForgetWeights[0][0]); + AllocateAndCopyDataToITensorHandle(&recurrentToOutputWeightsTensor, &recurrentToOutputWeights[0][0]); + + AllocateAndCopyDataToITensorHandle(&cellToForgetWeightsTensor, &cellToForgetWeights[0]); + AllocateAndCopyDataToITensorHandle(&cellToOutputWeightsTensor, &cellToOutputWeights[0]); + + + data.m_InputToCellWeights = &inputToCellWeightsTensor; + data.m_InputToForgetWeights = &inputToForgetWeightsTensor; + data.m_InputToOutputWeights = &inputToOutputWeightsTensor; + + data.m_CellBias = &cellBiasTensor; + data.m_ForgetGateBias = &forgetGateBiasTensor; + data.m_OutputGateBias = &outputGateBiasTensor; + + data.m_RecurrentToCellWeights = &recurrentToCellWeightsTensor; + data.m_RecurrentToForgetWeights = &recurrentToForgetWeightsTensor; + data.m_RecurrentToOutputWeights = &recurrentToOutputWeightsTensor; + + data.m_CellToForgetWeights = &cellToForgetWeightsTensor; + data.m_CellToOutputWeights = &cellToOutputWeightsTensor; + + // other parameters for the descriptor + data.m_Parameters.m_CifgEnabled = cifgEnabled; + data.m_Parameters.m_ProjectionEnabled = projectionEnabled; + data.m_Parameters.m_PeepholeEnabled = peepholeEnabled; + + data.m_Parameters.m_ActivationFunc = 4; + data.m_Parameters.m_ClippingThresProj = 0.0; + data.m_Parameters.m_ClippingThresCell = 0.0; + + + // List of outputs + std::vector<float> scratchBufferVector(batchSize * scratchBufferSize, 0.f); + auto scratchBufferTensor = MakeTensor<float,2>(scratchBufferTensorInfo, scratchBufferVector); + LayerTestResult<float, 2> ret0(scratchBufferTensorInfo); + + // Output state for a certain time step + std::vector<float> outputStateOutVector(batchSize * outputSize, 0.f); + auto outputStateOutTensor = MakeTensor<float,2>(outputStateOutTensorInfo, outputStateOutVector); + LayerTestResult<float, 2> ret1(outputStateOutTensorInfo); + + // Cell state for a certain time step + std::vector<float> cellStateOutVector(batchSize * cellSize, 0.f); + auto cellStateOutTensor = MakeTensor<float,2>(cellStateOutTensorInfo, cellStateOutVector); + LayerTestResult<float, 2> ret2(cellStateOutTensorInfo); + + // Output for a certain time step + std::vector<float> outputVector(batchSize * outputSize, 0.f); + auto outputTensor = MakeTensor<float, 2>(outputTensorInfo, 
outputVector); + std::vector<float> outputData; + outputData.assign(outputExpected.data(), outputExpected.data() + batchSize*outputSize); + LayerTestResult<float, 2> ret3(outputTensorInfo); + ret3.outputExpected = MakeTensor<float, 2>(outputTensorInfo, outputData); + + // Prepare the inputs and outputs for the workload + std::unique_ptr<armnn::ITensorHandle> inputHandle = + workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputStateInHandle = + workloadFactory.CreateTensorHandle(outputStateInTensorInfo); + std::unique_ptr<armnn::ITensorHandle> cellStateInHandle = + workloadFactory.CreateTensorHandle(cellStateInTensorInfo); + + std::unique_ptr<armnn::ITensorHandle> scratchBufferHandle = + workloadFactory.CreateTensorHandle(scratchBufferTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputStateOutHandle = + workloadFactory.CreateTensorHandle(outputStateOutTensorInfo); + std::unique_ptr<armnn::ITensorHandle> cellStateOutHandle = + workloadFactory.CreateTensorHandle(cellStateOutTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputHandle = + workloadFactory.CreateTensorHandle(outputTensorInfo); + + armnn::WorkloadInfo info; + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddInputToWorkload(data, info, outputStateInTensorInfo, outputStateInHandle.get()); + AddInputToWorkload(data, info, cellStateInTensorInfo, cellStateInHandle.get()); + + AddOutputToWorkload(data, info, scratchBufferTensorInfo, scratchBufferHandle.get()); + AddOutputToWorkload(data, info, outputStateOutTensorInfo, outputStateOutHandle.get()); + AddOutputToWorkload(data, info, cellStateOutTensorInfo, cellStateOutHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateLstm(data, info); + + + inputHandle->Allocate(); + outputStateInHandle->Allocate(); + cellStateInHandle->Allocate(); + + scratchBufferHandle->Allocate(); + outputStateOutHandle->Allocate(); + cellStateOutHandle->Allocate(); + outputHandle->Allocate(); + + + CopyDataToITensorHandle(inputHandle.get(), &inputTensor[0][0]); + CopyDataToITensorHandle(outputStateInHandle.get(), &outputStateInTensor[0][0]); + CopyDataToITensorHandle(cellStateInHandle.get(), &cellStateInTensor[0][0]); + + CopyDataToITensorHandle(scratchBufferHandle.get(), &scratchBufferTensor[0][0]); + CopyDataToITensorHandle(outputStateOutHandle.get(), &outputStateOutTensor[0][0]); + CopyDataToITensorHandle(cellStateOutHandle.get(), &cellStateOutTensor[0][0]); + + workloadFactory.Finalize(); + workload->Execute(); + + CopyDataFromITensorHandle(&ret0.output[0][0], scratchBufferHandle.get()); + CopyDataFromITensorHandle(&ret1.output[0][0], outputStateOutHandle.get()); + CopyDataFromITensorHandle(&ret2.output[0][0], cellStateOutHandle.get()); + CopyDataFromITensorHandle(&ret3.output[0][0], outputHandle.get()); + + return ret3; +} diff --git a/src/armnn/backends/test/MemCopyTests.cpp b/src/armnn/backends/test/MemCopyTests.cpp index 32331789e9..24a951c395 100644 --- a/src/armnn/backends/test/MemCopyTests.cpp +++ b/src/armnn/backends/test/MemCopyTests.cpp @@ -19,6 +19,10 @@ #include "TensorCopyUtils.hpp" #include "WorkloadTestUtils.hpp" +#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED +#include "../ArmComputeTensorUtils.hpp" +#endif + BOOST_AUTO_TEST_SUITE(MemCopyTestSuite) void MemCopyTest(armnn::IWorkloadFactory& srcWorkloadFactory, armnn::IWorkloadFactory& dstWorkloadFactory, @@ -81,6 +85,26 @@ void MemCopyTest(bool 
withSubtensors) MemCopyTest(srcWorkloadFactory, dstWorkloadFactory, withSubtensors); } +#if ARMCOMPUTECL_ENABLED || ARMCOMPUTENEON_ENABLED + +BOOST_AUTO_TEST_CASE(AclTypeConversions) +{ + arm_compute::Strides strides(1,2,3,4); + armnn::TensorShape convertedStrides = armnn::armcomputetensorutils::GetStrides(strides); + BOOST_TEST(convertedStrides[0] == 4); + BOOST_TEST(convertedStrides[1] == 3); + BOOST_TEST(convertedStrides[2] == 2); + BOOST_TEST(convertedStrides[3] == 1); + + arm_compute::TensorShape shape(5,6,7,8); + armnn::TensorShape convertedshape = armnn::armcomputetensorutils::GetShape(shape); + BOOST_TEST(convertedshape[0] == 8); + BOOST_TEST(convertedshape[1] == 7); + BOOST_TEST(convertedshape[2] == 6); + BOOST_TEST(convertedshape[3] == 5); +} +#endif + #if ARMCOMPUTECL_ENABLED BOOST_AUTO_TEST_CASE(CopyBetweenCpuAndGpu) diff --git a/src/armnn/backends/test/NormTestImpl.hpp b/src/armnn/backends/test/NormTestImpl.hpp index d9dc01592a..df8219ddbd 100644 --- a/src/armnn/backends/test/NormTestImpl.hpp +++ b/src/armnn/backends/test/NormTestImpl.hpp @@ -87,7 +87,7 @@ LayerTestResult<float,4> SimpleNormalizationTestImpl(armnn::IWorkloadFactory& wo // When normalising within channels, the 3x3 kernel covers the entire 2x2 input at every index. // Therefore, all output values should equal the inputs, but divided by: // pow((kappa + (accumulatedScale * alpha)), beta) - // ...where accumulatedScale is the sum of every element squared + // ...where accumulatedScale is the sum of every element squared. float divisor[inputNum]; for(int i = 0; i < boost::numeric_cast<int>(inputNum); i++) { @@ -139,7 +139,7 @@ LayerTestResult<float,4> SimpleNormalizationTestImpl(armnn::IWorkloadFactory& wo } break; } - case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough + case armnn::NormalizationAlgorithmMethod::LocalContrast: // NOTE: intentional fallthrough. default: { throw armnn::UnimplementedException("Unsupported normalisation method type, " diff --git a/src/armnn/backends/test/Pooling2dTestImpl.hpp b/src/armnn/backends/test/Pooling2dTestImpl.hpp index ab9fd6d6fb..e6e0e6721a 100644 --- a/src/armnn/backends/test/Pooling2dTestImpl.hpp +++ b/src/armnn/backends/test/Pooling2dTestImpl.hpp @@ -155,21 +155,21 @@ LayerTestResult<T, 4> SimpleMaxPooling2dSize3x3Stride2x4TestCommon(armnn::IWorkl 3.0f, 5.0f, 4.0f, 0.0f, 1.0f, 5.0f, 9.0f, 7.0f, }); - // Construct input data + // Constructs input data. std::vector<float> inputData; auto negator = [](float f) { return -f; }; - // First image (two channels where the second channel is the negative of the first one) + // First image (two channels where the second channel is the negative of the first one). inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end()); std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator); - // Second image (same as first image) + // Second image (same as first image). inputData.insert(inputData.end(), singleChannelData.begin(), singleChannelData.end()); std::transform(singleChannelData.begin(), singleChannelData.end(), std::back_inserter(inputData), negator); auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, inputData)); - // these were calculated manually + // These were calculated manually. 
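// As a cross-check for the hand-calculated values below, a naive max pooling of the following form
// reproduces the no-padding case. This is an illustrative sketch only (the 3x3 kernel and 2x4 stride
// are taken from the test name, padding is ignored, and it needs <vector>, <algorithm> and <limits>);
// it is not code used by the test.
std::vector<float> NaiveMaxPool2d(const std::vector<float>& input, unsigned int height, unsigned int width,
                                  unsigned int poolH, unsigned int poolW,
                                  unsigned int strideY, unsigned int strideX)
{
    std::vector<float> output;
    for (unsigned int y = 0; y + poolH <= height; y += strideY)
    {
        for (unsigned int x = 0; x + poolW <= width; x += strideX)
        {
            float maxVal = std::numeric_limits<float>::lowest();
            for (unsigned int ky = 0; ky < poolH; ++ky)
            {
                for (unsigned int kx = 0; kx < poolW; ++kx)
                {
                    // Take the maximum over the pooling window anchored at (y, x).
                    maxVal = std::max(maxVal, input[(y + ky) * width + (x + kx)]);
                }
            }
            output.push_back(maxVal);
        }
    }
    return output;
}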
auto shape(GetTensorShapeAsArray<4>(outputTensorInfo)); boost::multi_array<T, 4> outputExpected(shape); if (forceNoPadding) @@ -527,13 +527,13 @@ LayerTestResult<T, 4> AsymmetricNonSquarePooling2dTestCommon(armnn::IWorkloadFac descriptor.m_OutputShapeRounding = armnn::OutputShapeRounding::Floor; descriptor.m_PaddingMethod = armnn::PaddingMethod::Exclude; - // Construct input data + // Construct input data. auto input = MakeTensor<T, 4>(inputTensorInfo, QuantizedVector<T>(qScale, qOffset, { 1.0f, 3.0f, 4.0f, })); - // these were calculated manually + // These were calculated manually. auto outputExpected = MakeTensor<T, 4>(outputTensorInfo, QuantizedVector<T>(qScale, qOffset, { 0.0f, 3.0f, 0.0f, 3.0f, @@ -686,7 +686,7 @@ LayerTestResult<T, 4> SimpleMaxPooling2dSize2x2Stride2x2TestCommon(armnn::IWorkl 438.0f, 564.0f, 573.0f, 402.0f }; - // Note that left and right edges will be 0.f, due to the 2x2 max pooling only accessing zeros here + // Note that left and right edges will be 0.f, due to the 2x2 max pooling only accessing zeros here. std::vector<float> expectedOutputDataWithPadding = { 0.0f, 510.0f, 780.0f, 654.0f, 0.0f, 0.0f, 438.0f, 618.0f, 402.0f, 0.0f diff --git a/src/armnn/backends/test/QuantizeHelper.hpp b/src/armnn/backends/test/QuantizeHelper.hpp index bfaf9342f0..0a6ceb761d 100644 --- a/src/armnn/backends/test/QuantizeHelper.hpp +++ b/src/armnn/backends/test/QuantizeHelper.hpp @@ -61,7 +61,7 @@ struct IsFloatingPointIterator }; template <typename T, typename FloatIt, -typename std::enable_if<IsFloatingPointIterator<FloatIt>::value, int>::type=0 // Make sure valid fp iterator +typename std::enable_if<IsFloatingPointIterator<FloatIt>::value, int>::type=0 // Makes sure fp iterator is valid. > std::vector<T> QuantizedVector(float qScale, int32_t qOffset, FloatIt first, FloatIt last) { diff --git a/src/armnn/backends/test/Reference.cpp b/src/armnn/backends/test/Reference.cpp index b60483a4d9..dedeb50e33 100644 --- a/src/armnn/backends/test/Reference.cpp +++ b/src/armnn/backends/test/Reference.cpp @@ -127,25 +127,8 @@ ARMNN_AUTO_TEST_CASE(FullyConnectedLarge, FullyConnectedLargeTest, false) ARMNN_AUTO_TEST_CASE(FullyConnectedLargeTransposed, FullyConnectedLargeTest, true) // Splitter -BOOST_AUTO_TEST_CASE(SimpleSplitter) -{ - armnn::RefWorkloadFactory workloadFactory; - auto testResult = SplitterTest(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} - -BOOST_AUTO_TEST_CASE(SplitterUint8) -{ - armnn::RefWorkloadFactory workloadFactory; - auto testResult = SplitterUint8Test(workloadFactory); - for (unsigned int i = 0; i < testResult.size(); ++i) - { - BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); - } -} +ARMNN_AUTO_TEST_CASE(SimpleSplitter, SplitterTest) +ARMNN_AUTO_TEST_CASE(SimpleSplitterUint8, SplitterUint8Test) ARMNN_AUTO_TEST_CASE(CopyViaSplitter, CopyViaSplitterTest) ARMNN_AUTO_TEST_CASE(CopyViaSplitterUint8, CopyViaSplitterUint8Test) @@ -242,4 +225,9 @@ ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet1, PermuteFloat32ValueSet1Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet2, PermuteFloat32ValueSet2Test) ARMNN_AUTO_TEST_CASE(PermuteFloat32ValueSet3, PermuteFloat32ValueSet3Test) +// Convert from Float16 to Float32 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp16ToFp32, SimpleConvertFp16ToFp32Test) +// Convert from Float32 to Float16 +ARMNN_AUTO_TEST_CASE(SimpleConvertFp32ToFp16, SimpleConvertFp32ToFp16Test) + BOOST_AUTO_TEST_SUITE_END() diff --git 
a/src/armnn/backends/test/SoftmaxTestImpl.hpp b/src/armnn/backends/test/SoftmaxTestImpl.hpp index 4c3e0b73dd..9ed7f603a1 100644 --- a/src/armnn/backends/test/SoftmaxTestImpl.hpp +++ b/src/armnn/backends/test/SoftmaxTestImpl.hpp @@ -39,7 +39,7 @@ LayerTestResult<T, 2> SimpleSoftmaxTestImpl(armnn::IWorkloadFactory& workloadFac LayerTestResult<T, 2> ret(outputTensorInfo); - // Each row is independently softmax'd + // Each row is independently softmax'd. auto input = MakeTensor<T, 2>(inputTensorInfo, std::vector<T>( QuantizedVector<T>(qScale, 0, { 0.f, 1.f, 0.f, 0.f, diff --git a/src/armnn/backends/test/SplitterTestImpl.hpp b/src/armnn/backends/test/SplitterTestImpl.hpp index 70b798eafa..48c0730fa7 100644 --- a/src/armnn/backends/test/SplitterTestImpl.hpp +++ b/src/armnn/backends/test/SplitterTestImpl.hpp @@ -27,35 +27,35 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo // NOTE: Compute Library imposes a restriction that the x and y dimension (input height and width) // cannot be split. - // For the reasons for this see first comment on https://jira.arm.com/browse/IVGCVSW-1239 + // For the reasons for this, see first comment on https://jira.arm.com/browse/IVGCVSW-1239 // - // this test has therefore been recast to split the channels, then split the resulting subtensor + // This test has therefore been recast to split the channels, then split the resulting subtensor. - // to take channel 0 of original output - // and channel 0 and channel 1 of the split subtensor + // To take channel 0 of original output + // and channel 0 and channel 1 of the split subtensor. unsigned int outputWidth1 = inputWidth; unsigned int outputHeight1 = inputHeight; unsigned int outputChannels1 = 1; - // to take channel 1 and 2 of the original output + // To take channel 1 and 2 of the original output. unsigned int outputWidth2 = inputWidth; unsigned int outputHeight2 = inputHeight; unsigned int outputChannels2 = 2; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo({ inputChannels, inputHeight, inputWidth }, armnn::GetDataType<T>()); - // outputs of the original split + // Outputs of the original split. armnn::TensorInfo outputTensorInfo1({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType<T>()); armnn::TensorInfo outputTensorInfo2({ outputChannels2, outputHeight2, outputWidth2 }, armnn::GetDataType<T>()); - // outputs of the subsequent subtensor split + // Outputs of the subsequent subtensor split. armnn::TensorInfo outputTensorInfo3({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType<T>()); armnn::TensorInfo outputTensorInfo4({ outputChannels1, outputHeight1, outputWidth1 }, armnn::GetDataType<T>()); // Set quantization parameters if the requested type is a quantized type. - // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize + // The quantization doesn't really matter as the splitter operator doesn't dequantize/quantize. if(armnn::IsQuantizedType<T>()) { inputTensorInfo.SetQuantizationScale(qScale); @@ -100,7 +100,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 0 of the original input + // Channel 0 of the original input. 
ret1.outputExpected = MakeTensor<T, 3>(outputTensorInfo1, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { 1.0f, 2.0f, 3.0f, 4.0f, 5.0f, @@ -112,7 +112,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 1 & 2 of the original input + // Channel 1 & 2 of the original input. ret2.outputExpected = MakeTensor<T, 3>(outputTensorInfo2, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, @@ -131,7 +131,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 0 of return 2 (i.e. channels 1 and 2 of the original input) + // Channel 0 of return 2 (i.e. channels 1 and 2 of the original input). ret3.outputExpected = MakeTensor<T, 3>(outputTensorInfo3, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { 31.0f, 32.0f, 33.0f, 34.0f, 35.0f, @@ -143,7 +143,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // channel 1 of return 2 + // Channel 1 of return 2. ret4.outputExpected = MakeTensor<T, 3>(outputTensorInfo4, std::vector<T>( QuantizedVector<T>(qScale, qOffset, { 61.0f, 62.0f, 63.0f, 64.0f, 65.0f, @@ -155,19 +155,19 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo }) )); - // NOTE: as a corollary of the no splitting of x and y restriction the x and y values of the view origins + // NOTE: as a corollary of the restriction that x and y cannot be split, the x and y values of the view origins have to be zero, the co-ordinates are as per the tensor info above channels, height/y, width/x - note that under the hood the compute engine reverses these i.e. its coordinate system is x, y, channels - std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //extent of the window is defined by size of output[0] + note that under the hood the compute engine reverses these i.e. its coordinate system is x, y, channels. + std::vector<unsigned int> wOrigin1 = {0, 0, 0}; //Extent of the window is defined by size of output[0]. armnn::SplitterQueueDescriptor::ViewOrigin window1(wOrigin1); - std::vector<unsigned int> wOrigin2 = {1, 0, 0}; //extent of the window is defined by size of output[1] + std::vector<unsigned int> wOrigin2 = {1, 0, 0}; //Extent of the window is defined by size of output[1]. armnn::SplitterQueueDescriptor::ViewOrigin window2(wOrigin2); - std::vector<unsigned int> wOrigin3 = {0, 0, 0}; //extent of the window is defined by size of output[2] + std::vector<unsigned int> wOrigin3 = {0, 0, 0}; //Extent of the window is defined by size of output[2]. armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3); - std::vector<unsigned int> wOrigin4 = {1, 0, 0}; //extent of the window is defined by size of output[3] + std::vector<unsigned int> wOrigin4 = {1, 0, 0}; //Extent of the window is defined by size of output[3]. armnn::SplitterQueueDescriptor::ViewOrigin window4(wOrigin4); bool subTensorsSupported = workloadFactory.SupportsSubTensors(); @@ -217,7 +217,7 @@ std::vector<LayerTestResult<T,3>> SplitterTestCommon(armnn::IWorkloadFactory& wo CopyDataFromITensorHandle(&ret1.output[0][0][0], outputHandle1.get()); CopyDataFromITensorHandle(&ret2.output[0][0][0], outputHandle2.get()); -// // Do the second split +// // Do the second split.
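// Before the second split is wired up below, a note on view origins (illustrative only, not test
// code): a ViewOrigin gives the starting coordinate of an output view inside its input, here in
// {channel, y, x} order, and the view's extent is taken from the corresponding output TensorInfo.
// So wOrigin2 = {1, 0, 0} with an output shape of {2, height, width} selects channels 1 and 2 of
// the 3-channel input. A hypothetical CPU-side helper that extracts such a channel window:
template <typename T>
std::vector<T> ExtractChannelView(const std::vector<T>& input, unsigned int height, unsigned int width,
                                  unsigned int originChannel, unsigned int numChannels)
{
    std::vector<T> view;
    view.reserve(numChannels * height * width);
    for (unsigned int c = originChannel; c < originChannel + numChannels; ++c)
    {
        for (unsigned int i = 0; i < height * width; ++i)
        {
            // Copy one whole channel plane at a time; y and x always start at 0 here.
            view.push_back(input[c * height * width + i]);
        }
    }
    return view;
}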
armnn::SplitterQueueDescriptor data2; armnn::WorkloadInfo info2; AddInputToWorkload(data2, info2, outputTensorInfo2, outputHandle2.get()); diff --git a/src/armnn/backends/test/TensorCopyUtils.cpp b/src/armnn/backends/test/TensorCopyUtils.cpp index e15c12a76f..82e80a52fe 100644 --- a/src/armnn/backends/test/TensorCopyUtils.cpp +++ b/src/armnn/backends/test/TensorCopyUtils.cpp @@ -6,6 +6,7 @@ #include <algorithm> #include <cstring> #include <boost/cast.hpp> +#include <Half.hpp> #include "TensorCopyUtils.hpp" @@ -47,12 +48,15 @@ void CopyDataToITensorHandle(armnn::ITensorHandle* tensorHandle, const void* mem case arm_compute::DataType::QASYMM8: CopyArmComputeITensorData(static_cast<const uint8_t*>(mem), handle->GetTensor()); break; + case arm_compute::DataType::F16: + CopyArmComputeITensorData(static_cast<const armnn::Half*>(mem), handle->GetTensor()); + break; default: { throw armnn::UnimplementedException(); } } - handle->UnMap(); + handle->Unmap(); break; } #endif @@ -108,12 +112,15 @@ void CopyDataFromITensorHandle(void* mem, const armnn::ITensorHandle* tensorHand case arm_compute::DataType::QASYMM8: CopyArmComputeITensorData(handle->GetTensor(), static_cast<uint8_t*>(mem)); break; + case arm_compute::DataType::F16: + CopyArmComputeITensorData(handle->GetTensor(), static_cast<armnn::Half*>(mem)); + break; default: { throw armnn::UnimplementedException(); } } - const_cast<armnn::IClTensorHandle*>(handle)->UnMap(); + const_cast<armnn::IClTensorHandle*>(handle)->Unmap(); break; } #endif diff --git a/src/armnn/backends/test/WorkloadDataValidation.cpp b/src/armnn/backends/test/WorkloadDataValidation.cpp index c3a9d40116..bc3898b405 100644 --- a/src/armnn/backends/test/WorkloadDataValidation.cpp +++ b/src/armnn/backends/test/WorkloadDataValidation.cpp @@ -22,7 +22,7 @@ BOOST_AUTO_TEST_CASE(QueueDescriptor_Validate_WrongNumOfInputsOutputs) { InputQueueDescriptor invalidData; WorkloadInfo invalidInfo; - //invalid argument exception is expected, because no inputs and no outputs were defined + //Invalid argument exception is expected, because no inputs and no outputs were defined. BOOST_CHECK_THROW(RefWorkloadFactory().CreateInput(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -31,7 +31,7 @@ BOOST_AUTO_TEST_CASE(RefPooling2dFloat32Workload_Validate_WrongDimTensor) armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; - unsigned int inputShape[] = {2, 3, 4}; // <- invalid - input tensor has to be 4D + unsigned int inputShape[] = {2, 3, 4}; // <- Invalid - input tensor has to be 4D. unsigned int outputShape[] = {2, 3, 4, 5}; outputTensorInfo = armnn::TensorInfo(4, outputShape, armnn::DataType::Float32); @@ -43,7 +43,7 @@ BOOST_AUTO_TEST_CASE(RefPooling2dFloat32Workload_Validate_WrongDimTensor) AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); - // invalid argument exception is expected, input tensor has to be 4D + // Invalid argument exception is expected, input tensor has to be 4D. BOOST_CHECK_THROW(RefPooling2dFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -55,7 +55,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxQueueDescriptor_Validate_WrongInputHeight) unsigned int inputNum = 2; unsigned int outputChannels = inputChannels; - unsigned int outputHeight = inputHeight + 1; //makes data invalid - Softmax expects height and width to be 1 + unsigned int outputHeight = inputHeight + 1; //Makes data invalid - Softmax expects height and width to be 1. 
unsigned int outputWidth = inputWidth; unsigned int outputNum = inputNum; @@ -74,7 +74,7 @@ BOOST_AUTO_TEST_CASE(SoftmaxQueueDescriptor_Validate_WrongInputHeight) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - //invalid argument exception is expected, because height != 1 + //Invalid argument exception is expected, because height != 1. BOOST_CHECK_THROW(RefSoftmaxFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -90,7 +90,7 @@ BOOST_AUTO_TEST_CASE(FullyConnectedQueueDescriptor_Validate_RequiredDataMissing) unsigned int outputChannels = 3; unsigned int outputNum = 2; - // Define the tensor descriptors + // Define the tensor descriptors. armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; armnn::TensorInfo weightsDesc; @@ -120,8 +120,8 @@ BOOST_AUTO_TEST_CASE(FullyConnectedQueueDescriptor_Validate_RequiredDataMissing) invalidData.m_Parameters.m_TransposeWeightMatrix = false; - //invalid argument exception is expected, because not all required fields have been provided - //in particular inputsData[0], outputsData[0] and weightsData can not be null + //Invalid argument exception is expected, because not all required fields have been provided. + //In particular inputsData[0], outputsData[0] and weightsData can not be null. BOOST_CHECK_THROW(RefFullyConnectedFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -135,8 +135,8 @@ BOOST_AUTO_TEST_CASE(NormalizationQueueDescriptor_Validate_WrongInputHeight) constexpr unsigned int outputNum = inputNum; constexpr unsigned int outputChannels = inputChannels; - constexpr unsigned int outputHeight = inputHeight + 1; //makes data invalid - normalization requires - //input and output to have the same dimensions + constexpr unsigned int outputHeight = inputHeight + 1; //Makes data invalid - normalization requires. + //Input and output to have the same dimensions. constexpr unsigned int outputWidth = inputWidth; @@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(NormalizationQueueDescriptor_Validate_WrongInputHeight) invalidData.m_Parameters.m_Beta = beta; invalidData.m_Parameters.m_K = kappa; - //invalid argument exception is expected, because input height != output height + //Invalid argument exception is expected, because input height != output height. BOOST_CHECK_THROW(RefNormalizationFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -201,7 +201,7 @@ BOOST_AUTO_TEST_CASE(SplitterQueueDescriptor_Validate_WrongWindow) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // invalid since it has only 3 dimensions while the input tensor is 4d + // Invalid, since it has only 3 dimensions while the input tensor is 4d. std::vector<unsigned int> wOrigin = {0, 0, 0}; armnn::SplitterQueueDescriptor::ViewOrigin window(wOrigin); invalidData.m_ViewOrigins.push_back(window); @@ -210,7 +210,7 @@ BOOST_AUTO_TEST_CASE(SplitterQueueDescriptor_Validate_WrongWindow) "match input."); BOOST_CHECK_THROW(RefSplitterFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); - // invalid since window extends past the boundary of input tensor + // Invalid, since window extends past the boundary of input tensor. 
std::vector<unsigned int> wOrigin3 = {0, 0, 15, 0}; armnn::SplitterQueueDescriptor::ViewOrigin window3(wOrigin3); invalidData.m_ViewOrigins[0] = window3; @@ -259,7 +259,7 @@ BOOST_AUTO_TEST_CASE(MergerQueueDescriptor_Validate_WrongWindow) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // invalid since it has only 3 dimensions while the input tensor is 4d + // Invalid, since it has only 3 dimensions while the input tensor is 4d. std::vector<unsigned int> wOrigin = {0, 0, 0}; armnn::MergerQueueDescriptor::ViewOrigin window(wOrigin); invalidData.m_ViewOrigins.push_back(window); @@ -268,7 +268,7 @@ BOOST_AUTO_TEST_CASE(MergerQueueDescriptor_Validate_WrongWindow) "match input."); BOOST_CHECK_THROW(RefMergerFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); - // invalid since window extends past the boundary of output tensor + // Invalid, since window extends past the boundary of output tensor. std::vector<unsigned int> wOrigin3 = {0, 0, 15, 0}; armnn::MergerQueueDescriptor::ViewOrigin window3(wOrigin3); invalidData.m_ViewOrigins[0] = window3; @@ -308,17 +308,17 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputNumbers) AddInputToWorkload(invalidData, invalidInfo, input1TensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // too few inputs + // Too few inputs. BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); AddInputToWorkload(invalidData, invalidInfo, input2TensorInfo, nullptr); - // correct + // Correct. BOOST_CHECK_NO_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo)); AddInputToWorkload(invalidData, invalidInfo, input3TensorInfo, nullptr); - // too many inputs + // Too many inputs. BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } @@ -331,7 +331,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes) unsigned int shape1[] = {1, 1, 2, 1}; unsigned int shape2[] = {1, 1, 3, 2}; - // Incompatible shapes even with broadcasting + // Incompatible shapes even with broadcasting. { input1TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32); input2TensorInfo = armnn::TensorInfo(4, shape2, armnn::DataType::Float32); @@ -347,7 +347,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes) BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } - // Output size not compatible with input sizes + // Output size not compatible with input sizes. { input1TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32); input2TensorInfo = armnn::TensorInfo(4, shape1, armnn::DataType::Float32); @@ -360,7 +360,7 @@ BOOST_AUTO_TEST_CASE(AdditionQueueDescriptor_Validate_InputShapes) AddInputToWorkload(invalidData, invalidInfo, input2TensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // output differs + // Output differs. BOOST_CHECK_THROW(RefAdditionFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } } @@ -374,7 +374,7 @@ BOOST_AUTO_TEST_CASE(MultiplicationQueueDescriptor_Validate_InputTensorDimension constexpr unsigned int input0Shape[] = { 2, 2, 4, 4 }; constexpr std::size_t dimensionCount = std::extent<decltype(input0Shape)>::value; - // Check dimension consistency for input tensors + // Checks dimension consistency for input tensors. 
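// For reference, the broadcasting rule exercised by the addition-shape checks above can be
// summarised by the helper below. Illustrative sketch only; BroadcastCompatible is a hypothetical
// name, not an Arm NN API. The loop that follows then perturbs one input dimension at a time.
bool BroadcastCompatible(const armnn::TensorShape& a, const armnn::TensorShape& b)
{
    if (a.GetNumDimensions() != b.GetNumDimensions())
    {
        return false;
    }
    for (unsigned int i = 0; i < a.GetNumDimensions(); ++i)
    {
        // Per dimension the sizes must match, or one of them must be 1.
        if (a[i] != b[i] && a[i] != 1 && b[i] != 1)
        {
            return false;
        }
    }
    return true;
}
// For the shapes used above, {1, 1, 2, 1} vs {1, 1, 3, 2} fails at dimension 2 (2 vs 3, neither is 1),
// which is why the first addition case expects InvalidArgumentException.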
for (unsigned int dimIndex = 0; dimIndex < dimensionCount; ++dimIndex) { unsigned int input1Shape[dimensionCount]; @@ -399,7 +399,7 @@ BOOST_AUTO_TEST_CASE(MultiplicationQueueDescriptor_Validate_InputTensorDimension BOOST_CHECK_THROW(RefMultiplicationFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } - // Check dimension consistency for input and output tensors + // Checks dimension consistency for input and output tensors. for (unsigned int dimIndex = 0; dimIndex < dimensionCount; ++dimIndex) { unsigned int outputShape[dimensionCount]; @@ -430,7 +430,7 @@ BOOST_AUTO_TEST_CASE(ReshapeQueueDescriptor_Validate_MismatchingNumElements) armnn::TensorInfo inputTensorInfo; armnn::TensorInfo outputTensorInfo; - // The input and output shapes should have the same number of elements, but these don't + // The input and output shapes should have the same number of elements, but these don't. unsigned int inputShape[] = { 1, 1, 2, 3 }; unsigned int outputShape[] = { 1, 1, 1, 2 }; @@ -443,8 +443,29 @@ BOOST_AUTO_TEST_CASE(ReshapeQueueDescriptor_Validate_MismatchingNumElements) AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); - // InvalidArgumentException is expected, because the number of elements don't match + // InvalidArgumentException is expected, because the number of elements don't match. BOOST_CHECK_THROW(RefReshapeFloat32Workload(invalidData, invalidInfo), armnn::InvalidArgumentException); } + +BOOST_AUTO_TEST_CASE(LstmQueueDescriptor_Validate) +{ + armnn::TensorInfo inputTensorInfo; + armnn::TensorInfo outputTensorInfo; + + unsigned int inputShape[] = { 1, 2 }; + unsigned int outputShape[] = { 1 }; + + inputTensorInfo = armnn::TensorInfo(2, inputShape, armnn::DataType::Float32); + outputTensorInfo = armnn::TensorInfo(1, outputShape, armnn::DataType::Float32); + + LstmQueueDescriptor invalidData; + WorkloadInfo invalidInfo; + + AddInputToWorkload(invalidData, invalidInfo, inputTensorInfo, nullptr); + AddOutputToWorkload(invalidData, invalidInfo, outputTensorInfo, nullptr); + + BOOST_CHECK_THROW(invalidData.Validate(invalidInfo), armnn::InvalidArgumentException); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/layers/ActivationLayer.cpp b/src/armnn/layers/ActivationLayer.cpp index 2371eaa97c..ad1e4a9eba 100644 --- a/src/armnn/layers/ActivationLayer.cpp +++ b/src/armnn/layers/ActivationLayer.cpp @@ -30,12 +30,16 @@ ActivationLayer* ActivationLayer::Clone(Graph& graph) const void ActivationLayer::ValidateTensorShapesFromInputs() { - auto& info = GetInputSlot(0).GetConnection()->GetTensorInfo(); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual<LayerValidationException>( "ActivationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - info.GetShape()); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/AdditionLayer.cpp b/src/armnn/layers/AdditionLayer.cpp index 85d12eabcb..ab73a918db 100644 --- a/src/armnn/layers/AdditionLayer.cpp +++ b/src/armnn/layers/AdditionLayer.cpp @@ -28,41 +28,51 @@ AdditionLayer* AdditionLayer::Clone(Graph& graph) const return CloneBase<AdditionLayer>(graph, GetName()); } -void AdditionLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> 
AdditionLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - auto& input0 = GetInputSlot(0).GetConnection()->GetTensorInfo(); - auto& input1 = GetInputSlot(1).GetConnection()->GetTensorInfo(); + BOOST_ASSERT(inputShapes.size() == 2); + auto& input0 = inputShapes[0]; + auto& input1 = inputShapes[1]; - // Get the max of the inputs + // Get the max of the inputs. BOOST_ASSERT(input0.GetNumDimensions() == input1.GetNumDimensions()); unsigned int numDims = input0.GetNumDimensions(); std::vector<unsigned int> dims(numDims); - // validate inputs are broadcast compatible -#if !NDEBUG for (unsigned int i = 0; i < numDims; i++) { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; + unsigned int dim0 = input0[i]; + unsigned int dim1 = input1[i]; + + // Validates inputs are broadcast compatible. +#if !NDEBUG if (dim0 != dim1) { BOOST_ASSERT_MSG(dim0 == 1 || dim1 == 1, "Dimensions should either match or one should be of size 1."); } - } #endif - for (unsigned int i = 0; i < numDims; i++) - { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; dims[i] = std::max(dim0, dim1); } - TensorShape outShape(numDims, dims.data()); + return std::vector<TensorShape>({ TensorShape(numDims, dims.data()) }); +} + +void AdditionLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(2, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape() + }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( "AdditionLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/AdditionLayer.hpp b/src/armnn/layers/AdditionLayer.hpp index c48c027763..37f0b5c259 100644 --- a/src/armnn/layers/AdditionLayer.hpp +++ b/src/armnn/layers/AdditionLayer.hpp @@ -19,6 +19,8 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; + protected: AdditionLayer(const char* name); ~AdditionLayer() = default; diff --git a/src/armnn/layers/BatchNormalizationLayer.cpp b/src/armnn/layers/BatchNormalizationLayer.cpp index ebb8954ea7..0bf81ebec9 100644 --- a/src/armnn/layers/BatchNormalizationLayer.cpp +++ b/src/armnn/layers/BatchNormalizationLayer.cpp @@ -21,12 +21,19 @@ BatchNormalizationLayer::BatchNormalizationLayer(const armnn::BatchNormalization std::unique_ptr<IWorkload> BatchNormalizationLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. 
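// The assertions below rely on the ordering sketched here: workload creation still reads the
// constant tensors, and releasing them is assumed to happen only in a later pass. Illustrative
// sketch only; the function name and exact iteration order are assumptions, not Arm NN code.
void CreateWorkloadsThenReleaseConstants(const std::vector<armnn::Layer*>& layersInOrder,
                                         const armnn::Graph& graph,
                                         const armnn::IWorkloadFactory& factory)
{
    std::vector<std::unique_ptr<armnn::IWorkload>> workloads;
    for (armnn::Layer* layer : layersInOrder)
    {
        // Workload creation still dereferences the constant data (m_Mean, m_Variance, etc.).
        workloads.push_back(layer->CreateWorkload(graph, factory));
    }
    for (armnn::Layer* layer : layersInOrder)
    {
        // Only now is it safe to drop the CPU-side copies owned by each layer.
        layer->ReleaseConstantData();
    }
}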
+ BOOST_ASSERT_MSG(m_Mean != nullptr, "BatchNormalizationLayer: Mean data should not be null."); + BOOST_ASSERT_MSG(m_Variance != nullptr, "BatchNormalizationLayer: Variance data should not be null."); + BOOST_ASSERT_MSG(m_Beta != nullptr, "BatchNormalizationLayer: Beta data should not be null."); + BOOST_ASSERT_MSG(m_Gamma != nullptr, "BatchNormalizationLayer: Gamma data should not be null."); + BatchNormalizationQueueDescriptor descriptor; descriptor.m_Mean = m_Mean.get(); descriptor.m_Variance = m_Variance.get(); descriptor.m_Beta = m_Beta.get(); descriptor.m_Gamma = m_Gamma.get(); + return factory.CreateBatchNormalization(descriptor, PrepInfoAndDesc(descriptor, graph)); } @@ -44,17 +51,22 @@ BatchNormalizationLayer* BatchNormalizationLayer::Clone(Graph& graph) const void BatchNormalizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "BatchNormalizationLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "BatchNormalizationLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); - auto& info = GetInputSlot(0).GetConnection()->GetTensorInfo(); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual<LayerValidationException>( "BatchNormalizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - info.GetShape()); + inferredShapes[0]); + +} + +Layer::ConstantTensors BatchNormalizationLayer::GetConstantTensorsByRef() +{ + return {m_Mean, m_Variance, m_Beta, m_Gamma}; } } // namespace armnn diff --git a/src/armnn/layers/BatchNormalizationLayer.hpp b/src/armnn/layers/BatchNormalizationLayer.hpp index d8082e5e98..9a1b5bccc8 100644 --- a/src/armnn/layers/BatchNormalizationLayer.hpp +++ b/src/armnn/layers/BatchNormalizationLayer.hpp @@ -29,6 +29,8 @@ public: protected: BatchNormalizationLayer(const BatchNormalizationDescriptor& param, const char* name); ~BatchNormalizationLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/ConstantLayer.cpp b/src/armnn/layers/ConstantLayer.cpp index 937d38a31d..2abc595605 100644 --- a/src/armnn/layers/ConstantLayer.cpp +++ b/src/armnn/layers/ConstantLayer.cpp @@ -13,9 +13,8 @@ namespace armnn { -ConstantLayer::ConstantLayer(const std::shared_ptr<ScopedCpuTensorHandle>& input, const char* name) +ConstantLayer::ConstantLayer(const char* name) : Layer(0, 1, LayerType::Constant, name) - , m_LayerOutput(input) { } @@ -29,13 +28,22 @@ std::unique_ptr<IWorkload> ConstantLayer::CreateWorkload(const Graph& graph, ConstantLayer* ConstantLayer::Clone(Graph& graph) const { - // Cloned layers share the same layer output object - return CloneBase<ConstantLayer>(graph, m_LayerOutput, GetName()); + // Cloned layers share the same layer output object. + auto layer = CloneBase<ConstantLayer>(graph, GetName()); + + layer->m_LayerOutput = m_LayerOutput ? 
std::make_unique<ScopedCpuTensorHandle>(*m_LayerOutput) : nullptr; + + return std::move(layer); +} + +std::vector<TensorShape> ConstantLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const +{ + return std::vector<TensorShape>({ m_LayerOutput->GetTensorInfo().GetShape() }); } void ConstantLayer::ValidateTensorShapesFromInputs() { - // get the output shape from the value of the constant layer + // Get the output shape from the value of the constant layer. TensorShape const& outShape = m_LayerOutput->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual<LayerValidationException>( "ConstantLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", diff --git a/src/armnn/layers/ConstantLayer.hpp b/src/armnn/layers/ConstantLayer.hpp index e8e8d2298c..f215832eae 100644 --- a/src/armnn/layers/ConstantLayer.hpp +++ b/src/armnn/layers/ConstantLayer.hpp @@ -21,12 +21,18 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; + + // Free up the constant source data + void ReleaseConstantData() override {}; + + std::unique_ptr<ScopedCpuTensorHandle> m_LayerOutput; protected: - ConstantLayer(const std::shared_ptr<ScopedCpuTensorHandle>& input, const char* name); + ConstantLayer(const char* name); ~ConstantLayer() = default; -private: - std::shared_ptr<ScopedCpuTensorHandle> m_LayerOutput; + ConstantTensors GetConstantTensorsByRef() override { return {m_LayerOutput}; } + }; } // namespace diff --git a/src/armnn/layers/ConvertFp16ToFp32Layer.cpp b/src/armnn/layers/ConvertFp16ToFp32Layer.cpp new file mode 100644 index 0000000000..80d981c267 --- /dev/null +++ b/src/armnn/layers/ConvertFp16ToFp32Layer.cpp @@ -0,0 +1,48 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "ConvertFp16ToFp32Layer.hpp" +#include "LayerCloneBase.hpp" + +#include <armnn/TypesUtils.hpp> + +#include <backends/WorkloadData.hpp> +#include <backends/WorkloadFactory.hpp> + +namespace armnn +{ + +ConvertFp16ToFp32Layer::ConvertFp16ToFp32Layer(const char* name) + : Layer(1, 1, LayerType::ConvertFp16ToFp32, name) +{ +} + +std::unique_ptr<IWorkload> ConvertFp16ToFp32Layer::CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const +{ + ConvertFp16ToFp32QueueDescriptor descriptor; + return factory.CreateConvertFp16ToFp32(descriptor, PrepInfoAndDesc(descriptor, graph)); +} + +ConvertFp16ToFp32Layer* ConvertFp16ToFp32Layer::Clone(Graph& graph) const +{ + return CloneBase<ConvertFp16ToFp32Layer>(graph, GetName()); +} + +void ConvertFp16ToFp32Layer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + + ConditionalThrowIfNotEqual<LayerValidationException>( + "ConvertFp16ToFp32Layer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); +} + +} // namespace armnn diff --git a/src/armnn/layers/ConvertFp16ToFp32Layer.hpp b/src/armnn/layers/ConvertFp16ToFp32Layer.hpp new file mode 100644 index 0000000000..94f1fb8925 --- /dev/null +++ b/src/armnn/layers/ConvertFp16ToFp32Layer.hpp @@ -0,0 +1,28 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#pragma once + +#include <Layer.hpp> + +namespace armnn +{ + +class ConvertFp16ToFp32Layer : public Layer +{ +public: + virtual std::unique_ptr<IWorkload> CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const override; + + ConvertFp16ToFp32Layer* Clone(Graph& graph) const override; + + void ValidateTensorShapesFromInputs() override; + +protected: + ConvertFp16ToFp32Layer(const char* name); + ~ConvertFp16ToFp32Layer() = default; +}; + +} // namespace diff --git a/src/armnn/layers/ConvertFp32ToFp16Layer.cpp b/src/armnn/layers/ConvertFp32ToFp16Layer.cpp new file mode 100644 index 0000000000..70d6b668f8 --- /dev/null +++ b/src/armnn/layers/ConvertFp32ToFp16Layer.cpp @@ -0,0 +1,47 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "ConvertFp32ToFp16Layer.hpp" + +#include "LayerCloneBase.hpp" + +#include <armnn/TypesUtils.hpp> +#include <backends/WorkloadData.hpp> +#include <backends/WorkloadFactory.hpp> + +namespace armnn +{ + +ConvertFp32ToFp16Layer::ConvertFp32ToFp16Layer(const char* name) + : Layer(1, 1, LayerType::ConvertFp32ToFp16, name) +{ +} + +std::unique_ptr<IWorkload> ConvertFp32ToFp16Layer::CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const +{ + ConvertFp32ToFp16QueueDescriptor descriptor; + return factory.CreateConvertFp32ToFp16(descriptor, PrepInfoAndDesc(descriptor, graph)); +} + +ConvertFp32ToFp16Layer* ConvertFp32ToFp16Layer::Clone(Graph& graph) const +{ + return CloneBase<ConvertFp32ToFp16Layer>(graph, GetName()); +} + +void ConvertFp32ToFp16Layer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + + ConditionalThrowIfNotEqual<LayerValidationException>( + "ConvertFp32ToFp16Layer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); +} + +} // namespace armnn diff --git a/src/armnn/layers/ConvertFp32ToFp16Layer.hpp b/src/armnn/layers/ConvertFp32ToFp16Layer.hpp new file mode 100644 index 0000000000..5c3883021d --- /dev/null +++ b/src/armnn/layers/ConvertFp32ToFp16Layer.hpp @@ -0,0 +1,27 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include <Layer.hpp> + +namespace armnn +{ + +class ConvertFp32ToFp16Layer : public Layer +{ +public: + virtual std::unique_ptr<IWorkload> CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const override; + + ConvertFp32ToFp16Layer* Clone(Graph& graph) const override; + + void ValidateTensorShapesFromInputs() override; + +protected: + ConvertFp32ToFp16Layer(const char* name); + ~ConvertFp32ToFp16Layer() = default; +}; + +} // namespace diff --git a/src/armnn/layers/Convolution2dLayer.cpp b/src/armnn/layers/Convolution2dLayer.cpp index 3829f129bb..05c25bf3a0 100644 --- a/src/armnn/layers/Convolution2dLayer.cpp +++ b/src/armnn/layers/Convolution2dLayer.cpp @@ -20,11 +20,15 @@ Convolution2dLayer::Convolution2dLayer(const Convolution2dDescriptor& param, con std::unique_ptr<IWorkload> Convolution2dLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. 
+ BOOST_ASSERT_MSG(m_Weight != nullptr, "Convolution2dLayer: Weights data should not be null."); + Convolution2dQueueDescriptor descriptor; descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { + BOOST_ASSERT_MSG(m_Bias != nullptr, "Convolution2dLayer: Bias data should not be null."); descriptor.m_Bias = m_Bias.get(); } return factory.CreateConvolution2d(descriptor, PrepInfoAndDesc(descriptor, graph)); @@ -33,6 +37,7 @@ std::unique_ptr<IWorkload> Convolution2dLayer::CreateWorkload(const Graph& graph Convolution2dLayer* Convolution2dLayer::Clone(Graph& graph) const { auto layer = CloneBase<Convolution2dLayer>(graph, m_Param, GetName()); + layer->m_Weight = m_Weight ? std::make_unique<ScopedCpuTensorHandle>(*m_Weight) : nullptr; if (layer->m_Param.m_BiasEnabled) @@ -43,17 +48,11 @@ Convolution2dLayer* Convolution2dLayer::Clone(Graph& graph) const return std::move(layer); } -void Convolution2dLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> Convolution2dLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "Convolution2dLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "Convolution2dLayer: TensorInfo must be set on connected OutputSlot."); - - - IOutputSlot* input = GetInputSlot(0).GetConnection(); - const TensorShape& inputShape = input->GetTensorInfo().GetShape(); - const TensorShape filterShape = m_Weight->GetTensorInfo().GetShape(); + BOOST_ASSERT(inputShapes.size() == 2); + const TensorShape& inputShape = inputShapes[0]; + const TensorShape filterShape = inputShapes[1]; // If we support multiple batch dimensions in the future, then this assert will need to change. 
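For orientation, the width/height arithmetic that the diff context above elides from Convolution2dLayer::InferOutputShapes follows the conventional convolution output-size formula. The sketch below is only an illustration under that assumption; the function and parameter names (ConvOutputSize, lowPad, highPad) are ours, not ArmNN internals.

// Hedged sketch of the standard convolution output extent for one spatial axis,
// assuming floor division of the padded extent by the stride.
unsigned int ConvOutputSize(unsigned int inSize, unsigned int lowPad, unsigned int highPad,
                            unsigned int kernelSize, unsigned int stride)
{
    unsigned int paddedSize = inSize + lowPad + highPad;
    return (paddedSize - kernelSize) / stride + 1; // integer (floor) division
}
// e.g. ConvOutputSize(7, 1, 1, 3, 2) == 4 for a 7-wide input, 3-wide kernel, stride 2, pad 1.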
BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Convolutions will always have 4D input."); @@ -73,11 +72,31 @@ void Convolution2dLayer::ValidateTensorShapesFromInputs() unsigned int outChannels = filterShape[0]; unsigned int outBatchSize = inBatchSize; - TensorShape shapeOut({outBatchSize, outChannels, outHeight, outWidth}); + return std::vector<TensorShape>({ TensorShape({outBatchSize, outChannels, outHeight, outWidth})}); +} + +void Convolution2dLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + // check if we m_Weight data is not nullptr + BOOST_ASSERT_MSG(m_Weight != nullptr, "Convolution2dLayer: Weights data should not be null."); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + m_Weight->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( "Convolution2dLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - shapeOut); + inferredShapes[0]); +} + +Layer::ConstantTensors Convolution2dLayer::GetConstantTensorsByRef() +{ + return {m_Weight, m_Bias}; } } // namespace armnn diff --git a/src/armnn/layers/Convolution2dLayer.hpp b/src/armnn/layers/Convolution2dLayer.hpp index 4d2c6505d3..8659fe540d 100644 --- a/src/armnn/layers/Convolution2dLayer.hpp +++ b/src/armnn/layers/Convolution2dLayer.hpp @@ -24,9 +24,13 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; + protected: Convolution2dLayer(const Convolution2dDescriptor& param, const char* name); ~Convolution2dLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp index 0442de6c60..471bf015a9 100644 --- a/src/armnn/layers/DepthwiseConvolution2dLayer.cpp +++ b/src/armnn/layers/DepthwiseConvolution2dLayer.cpp @@ -22,11 +22,15 @@ DepthwiseConvolution2dLayer::DepthwiseConvolution2dLayer(const DepthwiseConvolut std::unique_ptr<IWorkload> DepthwiseConvolution2dLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. 
+ BOOST_ASSERT_MSG(m_Weight != nullptr, "DepthwiseConvolution2dLayer: Weights data should not be null."); + DepthwiseConvolution2dQueueDescriptor descriptor; descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { + BOOST_ASSERT_MSG(m_Bias != nullptr, "DepthwiseConvolution2dLayer: Bias data should not be null."); descriptor.m_Bias = m_Bias.get(); } return factory.CreateDepthwiseConvolution2d(descriptor, PrepInfoAndDesc(descriptor, graph)); @@ -45,16 +49,12 @@ DepthwiseConvolution2dLayer* DepthwiseConvolution2dLayer::Clone(Graph& graph) co return std::move(layer); } -void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> +DepthwiseConvolution2dLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "DepthwiseConvolution2dLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "DepthwiseConvolution2dLayer: TensorInfo must be set on connected OutputSlot."); - - IOutputSlot* input = GetInputSlot(0).GetConnection(); - const TensorShape& inputShape = input->GetTensorInfo().GetShape(); - const TensorShape filterShape = m_Weight->GetTensorInfo().GetShape(); + BOOST_ASSERT(inputShapes.size() == 2); + const TensorShape& inputShape = inputShapes[0]; + const TensorShape filterShape = inputShapes[1]; BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Convolutions will always have 4D input."); @@ -74,12 +74,32 @@ void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() unsigned int outChannels = filterShape[1]*depthMultiplier; unsigned int outBatchSize = inBatchSize; - TensorShape outShape({outBatchSize, outChannels, outHeight, outWidth}); + return std::vector<TensorShape>({ TensorShape({outBatchSize, outChannels, outHeight, outWidth})}); +} + +void DepthwiseConvolution2dLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + // on this level constant data should not be released.. 
+ BOOST_ASSERT_MSG(m_Weight != nullptr, "DepthwiseConvolution2dLayer: Weights data should not be null."); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + m_Weight->GetTensorInfo().GetShape() + }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( - "DepthwiseConvolution2dLayer: " - "TensorShape set on OutputSlot[0] does not match the inferred shape.", + "DepthwiseConvolution2dLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); +} + +Layer::ConstantTensors DepthwiseConvolution2dLayer::GetConstantTensorsByRef() +{ + return {m_Weight, m_Bias}; } } // namespace armnn diff --git a/src/armnn/layers/DepthwiseConvolution2dLayer.hpp b/src/armnn/layers/DepthwiseConvolution2dLayer.hpp index 60691bf73c..e3be152432 100644 --- a/src/armnn/layers/DepthwiseConvolution2dLayer.hpp +++ b/src/armnn/layers/DepthwiseConvolution2dLayer.hpp @@ -24,9 +24,13 @@ public: void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; + protected: DepthwiseConvolution2dLayer(const DepthwiseConvolution2dDescriptor& param, const char* name); ~DepthwiseConvolution2dLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/FakeQuantizationLayer.cpp b/src/armnn/layers/FakeQuantizationLayer.cpp index 24b53b2e37..7bda1c1f78 100644 --- a/src/armnn/layers/FakeQuantizationLayer.cpp +++ b/src/armnn/layers/FakeQuantizationLayer.cpp @@ -32,20 +32,16 @@ FakeQuantizationLayer* FakeQuantizationLayer::Clone(Graph& graph) const void FakeQuantizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "FakeQuantizationLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "FakeQuantizationLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); - IOutputSlot* input = GetInputSlot(0).GetConnection(); + BOOST_ASSERT(inferredShapes.size() == 1); - // input and output shapes are the same - TensorShape const& outShape = input->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual<LayerValidationException>( "FakeQuantizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/FloorLayer.cpp b/src/armnn/layers/FloorLayer.cpp index a9ddcca60c..e88600b354 100644 --- a/src/armnn/layers/FloorLayer.cpp +++ b/src/armnn/layers/FloorLayer.cpp @@ -32,18 +32,16 @@ FloorLayer* FloorLayer::Clone(Graph& graph) const void FloorLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "FloorLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "FloorLayer: TensorInfo must be set on connected OutputSlot."); - - // input and output shapes are the same - IOutputSlot* input = GetInputSlot(0).GetConnection(); - TensorShape const& outShape = 
input->GetTensorInfo().GetShape(); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( "FloorLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/FullyConnectedLayer.cpp b/src/armnn/layers/FullyConnectedLayer.cpp index 1597e8c2c3..8b8f010bdb 100644 --- a/src/armnn/layers/FullyConnectedLayer.cpp +++ b/src/armnn/layers/FullyConnectedLayer.cpp @@ -22,11 +22,15 @@ FullyConnectedLayer::FullyConnectedLayer(const FullyConnectedDescriptor& param, std::unique_ptr<IWorkload> FullyConnectedLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { + // on this level constant data should not be released.. + BOOST_ASSERT_MSG(m_Weight != nullptr, "FullyConnectedLayer: Weights data should not be null."); + FullyConnectedQueueDescriptor descriptor; descriptor.m_Weight = m_Weight.get(); if (m_Param.m_BiasEnabled) { + BOOST_ASSERT_MSG(m_Bias != nullptr, "FullyConnectedLayer: Bias data should not be null."); descriptor.m_Bias = m_Bias.get(); } return factory.CreateFullyConnected(descriptor, PrepInfoAndDesc(descriptor, graph)); @@ -45,25 +49,41 @@ FullyConnectedLayer* FullyConnectedLayer::Clone(Graph& graph) const return std::move(layer); } +std::vector<TensorShape> FullyConnectedLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const +{ + BOOST_ASSERT(inputShapes.size() == 2); + const TensorShape& inputShape = inputShapes[0]; + const TensorShape weightShape = inputShapes[1]; + + // Output for FC is [1, w[1]]. + unsigned int batches = inputShape[0]; + unsigned int dimIdx = m_Param.m_TransposeWeightMatrix ? 0 : 1; + + return std::vector<TensorShape>({ TensorShape({batches, weightShape[dimIdx]})}); +} + void FullyConnectedLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "FullyConnectedLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "FullyConnectedLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + // check if we m_Weight data is not nullptr + BOOST_ASSERT_MSG(m_Weight != nullptr, "FullyConnectedLayer: Weights data should not be null."); - TensorShape const& weightShape = m_Weight->GetTensorInfo().GetShape(); + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + m_Weight->GetTensorInfo().GetShape() }); - // output for FC is [1, w[1]] - unsigned int batches = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape()[0]; - unsigned int dimIdx = m_Param.m_TransposeWeightMatrix ? 
0 : 1; - TensorShape outShape({batches, weightShape[dimIdx]}); + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual<LayerValidationException>( "FullyConnectedLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); +} + +Layer::ConstantTensors FullyConnectedLayer::GetConstantTensorsByRef() +{ + return {m_Weight, m_Bias}; } } // namespace armnn diff --git a/src/armnn/layers/FullyConnectedLayer.hpp b/src/armnn/layers/FullyConnectedLayer.hpp index 1d6cb7cf8d..6300cafd62 100644 --- a/src/armnn/layers/FullyConnectedLayer.hpp +++ b/src/armnn/layers/FullyConnectedLayer.hpp @@ -23,10 +23,13 @@ public: FullyConnectedLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; protected: FullyConnectedLayer(const FullyConnectedDescriptor& param, const char* name); ~FullyConnectedLayer() = default; + + ConstantTensors GetConstantTensorsByRef() override; }; } // namespace diff --git a/src/armnn/layers/L2NormalizationLayer.cpp b/src/armnn/layers/L2NormalizationLayer.cpp index 07020bfdca..7249bc3b5c 100644 --- a/src/armnn/layers/L2NormalizationLayer.cpp +++ b/src/armnn/layers/L2NormalizationLayer.cpp @@ -32,19 +32,16 @@ L2NormalizationLayer* L2NormalizationLayer::Clone(Graph& graph) const void L2NormalizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "L2NormalizationLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "L2NormalizationLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); - IOutputSlot* input = GetInputSlot(0).GetConnection(); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - // input and output shapes are the same - TensorShape const& outShape = input->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual<LayerValidationException>( "L2NormalizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/LayerWithParameters.hpp b/src/armnn/layers/LayerWithParameters.hpp index e3eb40a273..c071c15c21 100644 --- a/src/armnn/layers/LayerWithParameters.hpp +++ b/src/armnn/layers/LayerWithParameters.hpp @@ -18,7 +18,7 @@ public: const Parameters& GetParameters() const { return m_Param; } /// Helper to serialize the layer parameters to string - /// (currently used in DotSerializer and company) + /// (currently used in DotSerializer and company). void SerializeLayerParameters(ParameterStringifyFunction & fn) const { StringifyLayerParameters<Parameters>::Serialize(fn, m_Param); @@ -37,7 +37,7 @@ protected: ~LayerWithParameters() = default; - /// Helper function to reduce duplication in *Layer::CreateWorkload + /// Helper function to reduce duplication in *Layer::CreateWorkload. template <typename QueueDescriptor> WorkloadInfo PrepInfoAndDesc(QueueDescriptor& descriptor, const Graph& graph) const { @@ -45,7 +45,7 @@ protected: return Layer::PrepInfoAndDesc(descriptor, graph); } - /// The parameters for the layer (not including tensor-valued weights etc.) 
+ /// The parameters for the layer (not including tensor-valued weights etc.). Parameters m_Param; }; diff --git a/src/armnn/layers/LstmLayer.cpp b/src/armnn/layers/LstmLayer.cpp new file mode 100644 index 0000000000..30c41bc9b8 --- /dev/null +++ b/src/armnn/layers/LstmLayer.cpp @@ -0,0 +1,259 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "LstmLayer.hpp" + +#include "LayerCloneBase.hpp" + +#include <armnn/TypesUtils.hpp> +#include <backends/CpuTensorHandle.hpp> +#include <backends/WorkloadFactory.hpp> + +namespace armnn +{ + +LstmLayer::LstmLayer(const LstmDescriptor& param, const char* name) + : LayerWithParameters(3, 4, LayerType::Lstm, param, name) +{ +} + +std::unique_ptr<IWorkload> LstmLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const +{ + LstmQueueDescriptor descriptor; + + // Basic parameters + descriptor.m_InputToForgetWeights = m_BasicParameters.m_InputToForgetWeights.get(); + descriptor.m_InputToCellWeights = m_BasicParameters.m_InputToCellWeights.get(); + descriptor.m_InputToOutputWeights = m_BasicParameters.m_InputToOutputWeights.get(); + descriptor.m_RecurrentToForgetWeights = m_BasicParameters.m_RecurrentToForgetWeights.get(); + descriptor.m_RecurrentToCellWeights = m_BasicParameters.m_RecurrentToCellWeights.get(); + descriptor.m_RecurrentToOutputWeights = m_BasicParameters.m_RecurrentToOutputWeights.get(); + descriptor.m_ForgetGateBias = m_BasicParameters.m_ForgetGateBias.get(); + descriptor.m_CellBias = m_BasicParameters.m_CellBias.get(); + descriptor.m_OutputGateBias = m_BasicParameters.m_OutputGateBias.get(); + + // Cifg parameters + if (!m_Param.m_CifgEnabled) + { + descriptor.m_InputToInputWeights = m_CifgParameters.m_InputToInputWeights.get(); + descriptor.m_RecurrentToInputWeights = m_CifgParameters.m_RecurrentToInputWeights.get(); + descriptor.m_CellToInputWeights = m_CifgParameters.m_CellToInputWeights.get(); + descriptor.m_InputGateBias = m_CifgParameters.m_InputGateBias.get(); + } + + // Projection parameters + if (m_Param.m_ProjectionEnabled) + { + descriptor.m_ProjectionWeights = m_ProjectionParameters.m_ProjectionWeights.get(); + descriptor.m_ProjectionBias = m_ProjectionParameters.m_ProjectionBias.get(); + } + + // Peephole parameters + if (m_Param.m_PeepholeEnabled) + { + descriptor.m_CellToForgetWeights = m_PeepholeParameters.m_CellToForgetWeights.get(); + descriptor.m_CellToOutputWeights = m_PeepholeParameters.m_CellToOutputWeights.get(); + } + return factory.CreateLstm(descriptor, PrepInfoAndDesc(descriptor, graph)); +} + +LstmLayer* LstmLayer::Clone(Graph& graph) const +{ + auto layer = CloneBase<LstmLayer>(graph, m_Param, GetName()); + + layer->m_BasicParameters.m_InputToForgetWeights = m_BasicParameters.m_InputToForgetWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_InputToForgetWeights) + : nullptr; + layer->m_BasicParameters.m_InputToCellWeights = m_BasicParameters.m_InputToCellWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_InputToCellWeights) : nullptr; + layer->m_BasicParameters.m_InputToOutputWeights = m_BasicParameters.m_InputToOutputWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_InputToOutputWeights) : nullptr; + layer->m_BasicParameters.m_RecurrentToForgetWeights = m_BasicParameters.m_RecurrentToForgetWeights ? 
+ std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_RecurrentToForgetWeights) : nullptr; + layer->m_BasicParameters.m_RecurrentToCellWeights = m_BasicParameters.m_RecurrentToCellWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_RecurrentToCellWeights) : nullptr; + layer->m_BasicParameters.m_RecurrentToOutputWeights = m_BasicParameters.m_RecurrentToOutputWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_RecurrentToOutputWeights) : nullptr; + layer->m_BasicParameters.m_ForgetGateBias = m_BasicParameters.m_ForgetGateBias ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_ForgetGateBias) : nullptr; + layer->m_BasicParameters.m_CellBias = m_BasicParameters.m_CellBias ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_CellBias) : nullptr; + layer->m_BasicParameters.m_OutputGateBias = m_BasicParameters.m_OutputGateBias ? + std::make_unique<ScopedCpuTensorHandle>(*m_BasicParameters.m_OutputGateBias) : nullptr; + + if (!m_Param.m_CifgEnabled) + { + layer->m_CifgParameters.m_InputToInputWeights = m_CifgParameters.m_InputToInputWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_CifgParameters.m_InputToInputWeights) : nullptr; + layer->m_CifgParameters.m_RecurrentToInputWeights = m_CifgParameters.m_RecurrentToInputWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_CifgParameters.m_RecurrentToInputWeights) : nullptr; + layer->m_CifgParameters.m_CellToInputWeights = m_CifgParameters.m_CellToInputWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_CifgParameters.m_CellToInputWeights) : nullptr; + layer->m_CifgParameters.m_InputGateBias = m_CifgParameters.m_InputGateBias ? + std::make_unique<ScopedCpuTensorHandle>(*m_CifgParameters.m_InputGateBias) : nullptr; + } + + if (m_Param.m_ProjectionEnabled) + { + layer->m_ProjectionParameters.m_ProjectionWeights = m_ProjectionParameters.m_ProjectionWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_ProjectionParameters.m_ProjectionWeights) : nullptr; + layer->m_ProjectionParameters.m_ProjectionBias = m_ProjectionParameters.m_ProjectionBias ? + std::make_unique<ScopedCpuTensorHandle>(*m_ProjectionParameters.m_ProjectionBias) : nullptr; + } + + if (m_Param.m_PeepholeEnabled) + { + layer->m_PeepholeParameters.m_CellToForgetWeights = m_PeepholeParameters.m_CellToForgetWeights ? + std::make_unique<ScopedCpuTensorHandle>(*m_PeepholeParameters.m_CellToForgetWeights) : nullptr; + layer->m_PeepholeParameters.m_CellToOutputWeights = m_PeepholeParameters.m_CellToOutputWeights ? 
+ std::make_unique<ScopedCpuTensorHandle>(*m_PeepholeParameters.m_CellToOutputWeights) : nullptr; + } + + return std::move(layer); +} + +std::vector<TensorShape> LstmLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const +{ + BOOST_ASSERT(inputShapes.size() == 3); + + // Get input values for validation + unsigned int batchSize = inputShapes[0][0]; + unsigned int outputSize = inputShapes[1][1]; + unsigned int numUnits = inputShapes[2][1]; + + std::vector<TensorShape> outShapes; + if (!m_Param.m_CifgEnabled) + { + outShapes.push_back(TensorShape({batchSize, numUnits*3})); + } + else + { + outShapes.push_back(TensorShape({batchSize, numUnits*4})); + } + outShapes.push_back(TensorShape({batchSize, outputSize})); + outShapes.push_back(TensorShape({batchSize, numUnits})); + outShapes.push_back(TensorShape({batchSize, outputSize})); + + return outShapes; +} + +void LstmLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(3, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes( { + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(2).GetConnection()->GetTensorInfo().GetShape()} + ); + + BOOST_ASSERT(inferredShapes.size() == 4); + + // Check if the weights are nullptr + BOOST_ASSERT_MSG(m_BasicParameters.m_InputToForgetWeights != nullptr, + "LstmLayer: m_BasicParameters.m_InputToForgetWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_InputToCellWeights != nullptr, + "LstmLayer: m_BasicParameters.m_InputToCellWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_InputToOutputWeights != nullptr, + "LstmLayer: m_BasicParameters.m_InputToOutputWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_RecurrentToForgetWeights != nullptr, + "LstmLayer: m_BasicParameters.m_RecurrentToForgetWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_RecurrentToCellWeights != nullptr, + "LstmLayer: m_BasicParameters.m_RecurrentToCellWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_RecurrentToOutputWeights != nullptr, + "LstmLayer: m_BasicParameters.m_RecurrentToOutputWeights should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_ForgetGateBias != nullptr, + "LstmLayer: m_BasicParameters.m_ForgetGateBias should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_CellBias != nullptr, + "LstmLayer: m_BasicParameters.m_CellBias should not be null."); + BOOST_ASSERT_MSG(m_BasicParameters.m_OutputGateBias != nullptr, + "LstmLayer: m_BasicParameters.m_OutputGateBias should not be null."); + + if (!m_Param.m_CifgEnabled) + { + BOOST_ASSERT_MSG(m_CifgParameters.m_InputToInputWeights != nullptr, + "LstmLayer: m_CifgParameters.m_InputToInputWeights should not be null."); + BOOST_ASSERT_MSG(m_CifgParameters.m_RecurrentToInputWeights != nullptr, + "LstmLayer: m_CifgParameters.m_RecurrentToInputWeights should not be null."); + BOOST_ASSERT_MSG(m_CifgParameters.m_InputGateBias != nullptr, + "LstmLayer: m_CifgParameters.m_InputGateBias should not be null."); + + ConditionalThrowIfNotEqual<LayerValidationException>( + "LstmLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); + } + else + { + BOOST_ASSERT_MSG(m_CifgParameters.m_InputToInputWeights == nullptr, + "LstmLayer: m_CifgParameters.m_InputToInputWeights should not have a value when CIFG is enabled."); + 
BOOST_ASSERT_MSG(m_CifgParameters.m_RecurrentToInputWeights == nullptr, + "LstmLayer: m_CifgParameters.m_RecurrentToInputWeights should not have a value when CIFG is enabled."); + BOOST_ASSERT_MSG(m_CifgParameters.m_CellToInputWeights == nullptr, + "LstmLayer: m_CifgParameters.m_CellToInputWeights should not have a value when CIFG is enabled."); + BOOST_ASSERT_MSG(m_CifgParameters.m_InputGateBias == nullptr, + "LstmLayer: m_CifgParameters.m_InputGateBias should not have a value when CIFG is enabled."); + + ConditionalThrowIfNotEqual<LayerValidationException>( + "LstmLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", + GetOutputSlot(0).GetTensorInfo().GetShape(), + inferredShapes[0]); + } + + if (m_Param.m_ProjectionEnabled) + { + BOOST_ASSERT_MSG(m_ProjectionParameters.m_ProjectionWeights != nullptr, + "LstmLayer: m_ProjectionParameters.m_ProjectionWeights should not be null."); + } + + if (m_Param.m_PeepholeEnabled) + { + BOOST_ASSERT_MSG(m_PeepholeParameters.m_CellToForgetWeights != nullptr, + "LstmLayer: m_PeepholeParameters.m_CellToForgetWeights should not be null."); + BOOST_ASSERT_MSG(m_PeepholeParameters.m_CellToOutputWeights != nullptr, + "LstmLayer: m_PeepholeParameters.m_CellToOutputWeights should not be null."); + } + + ConditionalThrowIfNotEqual<LayerValidationException>( + "LstmLayer: TensorShape set on OutputSlot[1] does not match the inferred shape.", + GetOutputSlot(1).GetTensorInfo().GetShape(), + inferredShapes[1]); + ConditionalThrowIfNotEqual<LayerValidationException>( + "LstmLayer: TensorShape set on OutputSlot[2] does not match the inferred shape.", + GetOutputSlot(2).GetTensorInfo().GetShape(), + inferredShapes[2]); + ConditionalThrowIfNotEqual<LayerValidationException>( + "LstmLayer: TensorShape set on OutputSlot[3] does not match the inferred shape.", + GetOutputSlot(3).GetTensorInfo().GetShape(), + inferredShapes[3]); +} + +Layer::ConstantTensors LstmLayer::GetConstantTensorsByRef() +{ + return {m_BasicParameters.m_InputToForgetWeights, + m_BasicParameters.m_InputToCellWeights, + m_BasicParameters.m_InputToOutputWeights, + m_BasicParameters.m_RecurrentToForgetWeights, + m_BasicParameters.m_RecurrentToCellWeights, + m_BasicParameters.m_RecurrentToOutputWeights, + m_BasicParameters.m_ForgetGateBias, + m_BasicParameters.m_CellBias, + m_BasicParameters.m_OutputGateBias, + + // Cifg parameters + m_CifgParameters.m_InputToInputWeights, + m_CifgParameters.m_RecurrentToInputWeights, + m_CifgParameters.m_CellToInputWeights, + m_CifgParameters.m_InputGateBias, + + // Projection parameters + m_ProjectionParameters.m_ProjectionWeights, + m_ProjectionParameters.m_ProjectionBias, + + // Peephole parameters + m_PeepholeParameters.m_CellToForgetWeights, + m_PeepholeParameters.m_CellToOutputWeights}; +} + +} // namespace armnn diff --git a/src/armnn/layers/LstmLayer.hpp b/src/armnn/layers/LstmLayer.hpp new file mode 100644 index 0000000000..7133ad26a5 --- /dev/null +++ b/src/armnn/layers/LstmLayer.hpp @@ -0,0 +1,70 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#pragma once + +#include "LayerWithParameters.hpp" + +namespace armnn +{ + +class ScopedCpuTensorHandle; + +struct LstmOptCifgParameters +{ + std::unique_ptr<ScopedCpuTensorHandle> m_InputToInputWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToInputWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_CellToInputWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_InputGateBias; +}; + +struct LstmOptProjectionParameters +{ + std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_ProjectionBias; +}; + +struct LstmOptPeepholeParameters +{ + std::unique_ptr<ScopedCpuTensorHandle> m_CellToForgetWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_CellToOutputWeights; +}; + +struct LstmBasicParameters +{ + std::unique_ptr<ScopedCpuTensorHandle> m_InputToForgetWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_InputToCellWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_InputToOutputWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToForgetWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToCellWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_RecurrentToOutputWeights; + std::unique_ptr<ScopedCpuTensorHandle> m_ForgetGateBias; + std::unique_ptr<ScopedCpuTensorHandle> m_CellBias; + std::unique_ptr<ScopedCpuTensorHandle> m_OutputGateBias; +}; + +class LstmLayer : public LayerWithParameters<LstmDescriptor> +{ +public: + + LstmBasicParameters m_BasicParameters; + LstmOptCifgParameters m_CifgParameters; + LstmOptProjectionParameters m_ProjectionParameters; + LstmOptPeepholeParameters m_PeepholeParameters; + + virtual std::unique_ptr<IWorkload> CreateWorkload(const Graph& graph, + const IWorkloadFactory& factory) const override; + LstmLayer* Clone(Graph& graph) const override; + + void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; + +protected: + LstmLayer(const LstmDescriptor& param, const char* name); + ~LstmLayer() = default; + + Layer::ConstantTensors GetConstantTensorsByRef() override; +}; + +} // namespace diff --git a/src/armnn/layers/MemCopyLayer.cpp b/src/armnn/layers/MemCopyLayer.cpp index 973a756b21..83f77edf58 100644 --- a/src/armnn/layers/MemCopyLayer.cpp +++ b/src/armnn/layers/MemCopyLayer.cpp @@ -9,6 +9,7 @@ #include <armnn/TypesUtils.hpp> #include <backends/WorkloadData.hpp> #include <backends/WorkloadFactory.hpp> +#include <backends/MemCopyWorkload.hpp> namespace armnn { @@ -26,23 +27,23 @@ MemCopyLayer* MemCopyLayer::Clone(Graph& graph) const std::unique_ptr<IWorkload> MemCopyLayer::CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const { MemCopyQueueDescriptor descriptor; - return factory.CreateMemCopy(descriptor, PrepInfoAndDesc(descriptor, graph)); + + //This is different from other workloads. Does not get created by the workload factory. 
+ return std::make_unique<CopyMemGenericWorkload>(descriptor, PrepInfoAndDesc(descriptor, graph)); } void MemCopyLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "MemCopyLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "MemCopyLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); - IOutputSlot* input = GetInputSlot(0).GetConnection(); + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual<LayerValidationException>( "MemCopyLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - input->GetTensorInfo().GetShape()); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/MergerLayer.cpp b/src/armnn/layers/MergerLayer.cpp index 065fc86a1b..e810b5e0bb 100644 --- a/src/armnn/layers/MergerLayer.cpp +++ b/src/armnn/layers/MergerLayer.cpp @@ -23,7 +23,7 @@ std::unique_ptr<IWorkload> MergerLayer::CreateWorkload(const Graph& graph, const { MergerQueueDescriptor descriptor; - // copy the view origins to the descriptor + // Copies the view origins to the descriptor. descriptor.m_ViewOrigins.reserve(m_Param.GetNumViews()); for (unsigned int i = 0; i < m_Param.GetNumViews(); ++i) { @@ -36,9 +36,9 @@ std::unique_ptr<IWorkload> MergerLayer::CreateWorkload(const Graph& graph, const void MergerLayer::CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory) { - //if sub tensors are supported than the merger + //If sub tensors are supported than the merger //just needs to make sure that the outputs of the prev layer - //are made subtensors of the output of the merger layer + //are made subtensors of the output of the merger layer. m_OutputHandlers[0].CreateTensorHandles(factory); if (factory.SupportsSubTensors()) { @@ -76,33 +76,28 @@ MergerLayer* MergerLayer::Clone(Graph& graph) const return CloneBase<MergerLayer>(graph, m_Param, GetName()); } -void MergerLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> MergerLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - // Validate Merger layer - ConditionalThrowIfNotEqual<LayerValidationException>( - "MergerLayer: Num Inputs must match num views.", - m_Param.GetNumViews(), - GetNumInputSlots()); + BOOST_ASSERT(inputShapes.size() == m_Param.GetNumViews()); unsigned int numDims = m_Param.GetNumDimensions(); - for (unsigned int i=0; i<GetNumInputSlots(); i++) + for (unsigned int i=0; i< inputShapes.size(); i++) { - auto& inputInfo = GetInputSlot(i).GetConnection()->GetTensorInfo(); + auto& inputShape = inputShapes[i]; - boost::ignore_unused(inputInfo); ConditionalThrowIfNotEqual<LayerValidationException>( "MergerLayer: Num Dimensions must match all inputs.", numDims, - inputInfo.GetNumDimensions()); + inputShape.GetNumDimensions()); } - // Find the bounding box (extents) of all the views + // Finds the bounding box (extents) of all the views. 
std::vector<unsigned int> extentMin(numDims); std::vector<unsigned int> extentMax(numDims); - for (unsigned int i = 0; i < GetNumInputSlots(); i++) + for (unsigned int i = 0; i < inputShapes.size(); i++) { const uint32_t* origin = m_Param.GetViewOrigin(i); - const armnn::TensorShape& shape = GetInputSlot(i).GetConnection()->GetTensorInfo().GetShape(); + const armnn::TensorShape& shape = inputShapes[i]; for (unsigned int d = 0; d < numDims; d++) { extentMin[d] = std::min(extentMin[d], origin[d]); @@ -110,23 +105,23 @@ void MergerLayer::ValidateTensorShapesFromInputs() } } - // Check that the bounding box starts at the origin + // Checks that the bounding box starts at the origin. if (!std::all_of(extentMin.begin(), extentMin.end(), [](unsigned int s) { return s == 0; })) { throw LayerValidationException("MergerLayer: there is no view that starts at the origin"); } - // Check that there are no overlaps of views (this would lead to undefined output at those locations). - // Check each pair of views against each other - // (and don't bother to check against self, or check the same pair both ways round) - for (unsigned int a = 0; a < GetNumInputSlots(); a++) + // Checks that there are no overlaps of views (this would lead to undefined output at those locations). + // Checks each pair of views against each other + // (and doesn't bother to check against self, or check the same pair both ways round). + for (unsigned int a = 0; a < inputShapes.size(); a++) { const uint32_t* aOrigin = m_Param.GetViewOrigin(a); - const armnn::TensorShape& aShape = GetInputSlot(a).GetConnection()->GetTensorInfo().GetShape(); + const armnn::TensorShape& aShape = inputShapes[a]; for (unsigned int b = 0; b < a; b++) { const uint32_t* bOrigin = m_Param.GetViewOrigin(b); - const armnn::TensorShape& bShape = GetInputSlot(b).GetConnection()->GetTensorInfo().GetShape(); + const armnn::TensorShape& bShape = inputShapes[b]; bool allAxesOverlap = true; for (unsigned int d = 0; d < numDims && allAxesOverlap; d++) @@ -149,13 +144,13 @@ void MergerLayer::ValidateTensorShapesFromInputs() } } - // Check that there are no "holes", i.e. regions of the output which is not covered by a view. + // Checks that there are no "holes", i.e. regions of the output which is not covered by a view. // Because we already checked that there are no overlaps, this can be done simply by checking that // the total 'volume' of the views is the same as the output. unsigned int totalViewsVolume = 0; - for (unsigned int i = 0; i < GetNumInputSlots(); i++) + for (unsigned int i = 0; i < inputShapes.size(); i++) { - totalViewsVolume += GetInputSlot(i).GetConnection()->GetTensorInfo().GetNumElements(); + totalViewsVolume += inputShapes[i].GetNumElements(); } unsigned int outputVolume = 1; for (unsigned int d = 0; d < numDims; d++) @@ -168,11 +163,33 @@ void MergerLayer::ValidateTensorShapesFromInputs() totalViewsVolume, outputVolume); - TensorShape outShape(numDims, extentMax.data()); + return std::vector<TensorShape>({ TensorShape({numDims, extentMax.data()}) }); +} + +void MergerLayer::ValidateTensorShapesFromInputs() +{ + // Validates Merger layer. 
+ ConditionalThrowIfNotEqual<LayerValidationException>( + "MergerLayer: Num Inputs must match num views.", + m_Param.GetNumViews(), + GetNumInputSlots()); + + VerifyLayerConnections(m_Param.GetNumViews(), CHECK_LOCATION()); + + std::vector<TensorShape> inputShapes; + for (uint i = 0; i < GetNumInputSlots(); ++i) + { + inputShapes.push_back(GetInputSlot(i).GetConnection()->GetTensorInfo().GetShape()); + } + + auto inferredShapes = InferOutputShapes(inputShapes); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( "MergerLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn armnn diff --git a/src/armnn/layers/MergerLayer.hpp b/src/armnn/layers/MergerLayer.hpp index ad94cb5f3a..b6261027d4 100644 --- a/src/armnn/layers/MergerLayer.hpp +++ b/src/armnn/layers/MergerLayer.hpp @@ -19,6 +19,7 @@ public: MergerLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; protected: MergerLayer(const OriginsDescriptor& param, const char* name); diff --git a/src/armnn/layers/MultiplicationLayer.cpp b/src/armnn/layers/MultiplicationLayer.cpp index af40a23007..ed7683da5f 100644 --- a/src/armnn/layers/MultiplicationLayer.cpp +++ b/src/armnn/layers/MultiplicationLayer.cpp @@ -31,41 +31,51 @@ MultiplicationLayer* MultiplicationLayer::Clone(Graph& graph) const return CloneBase<MultiplicationLayer>(graph, GetName()); } -void MultiplicationLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> MultiplicationLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - auto& input0 = GetInputSlot(0).GetConnection()->GetTensorInfo(); - auto& input1 = GetInputSlot(1).GetConnection()->GetTensorInfo(); + BOOST_ASSERT(inputShapes.size() == 2); + auto& input0 = inputShapes[0]; + auto& input1 = inputShapes[1]; - // Get the max of the inputs + // Get the max of the inputs. BOOST_ASSERT(input0.GetNumDimensions() == input1.GetNumDimensions()); unsigned int numDims = input0.GetNumDimensions(); std::vector<unsigned int> dims(numDims); - // validate inputs are broadcast compatible -#if !NDEBUG for (unsigned int i = 0; i < numDims; i++) { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; + unsigned int dim0 = input0[i]; + unsigned int dim1 = input1[i]; + + // Validates inputs are broadcast compatible. 
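Both AdditionLayer and MultiplicationLayer infer the output shape with the same broadcast rule shown in these hunks: every pair of dimensions must match or one of them must be 1, and the output takes the larger extent. A minimal standalone sketch of that rule (BroadcastShape is an assumed helper name, not ArmNN code):

#include <algorithm>
#include <cassert>
#include <cstddef>
#include <vector>

// Sketch: element-wise broadcast shape rule used by the binary layers above.
std::vector<unsigned int> BroadcastShape(const std::vector<unsigned int>& a,
                                         const std::vector<unsigned int>& b)
{
    assert(a.size() == b.size());                       // same rank, as the layers assert
    std::vector<unsigned int> out(a.size());
    for (std::size_t i = 0; i < a.size(); ++i)
    {
        assert(a[i] == b[i] || a[i] == 1 || b[i] == 1); // match, or one side is broadcastable
        out[i] = std::max(a[i], b[i]);
    }
    return out;
}
// e.g. BroadcastShape({1, 2, 3, 4}, {1, 1, 1, 4}) yields {1, 2, 3, 4}.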
+#if !NDEBUG if (dim0 != dim1) { BOOST_ASSERT_MSG(dim0 == 1 || dim1 == 1, "Dimensions should either match or one should be of size 1."); } - } #endif - for (unsigned int i = 0; i < numDims; i++) - { - unsigned int dim0 = input0.GetShape()[i]; - unsigned int dim1 = input1.GetShape()[i]; dims[i] = std::max(dim0, dim1); } - TensorShape outShape(numDims, dims.data()); + return std::vector<TensorShape>({ TensorShape(numDims, dims.data()) }); +} + +void MultiplicationLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(2, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ + GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(), + GetInputSlot(1).GetConnection()->GetTensorInfo().GetShape() + }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( "MultiplicationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/MultiplicationLayer.hpp b/src/armnn/layers/MultiplicationLayer.hpp index 48db9f4d01..bbfd1ee694 100644 --- a/src/armnn/layers/MultiplicationLayer.hpp +++ b/src/armnn/layers/MultiplicationLayer.hpp @@ -18,6 +18,7 @@ public: MultiplicationLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; protected: MultiplicationLayer(const char* name); diff --git a/src/armnn/layers/NormalizationLayer.cpp b/src/armnn/layers/NormalizationLayer.cpp index cacd348444..261b16a307 100644 --- a/src/armnn/layers/NormalizationLayer.cpp +++ b/src/armnn/layers/NormalizationLayer.cpp @@ -31,14 +31,16 @@ NormalizationLayer* NormalizationLayer::Clone(Graph& graph) const void NormalizationLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "NormalizationLayer: Input slot must be connected."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - const TensorShape& outShape = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual<LayerValidationException>( "NormalizationLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/OutputLayer.cpp b/src/armnn/layers/OutputLayer.cpp index cadcf2da2f..748f275d74 100644 --- a/src/armnn/layers/OutputLayer.cpp +++ b/src/armnn/layers/OutputLayer.cpp @@ -29,7 +29,7 @@ OutputLayer* OutputLayer::Clone(Graph& graph) const void OutputLayer::ValidateTensorShapesFromInputs() { - // Just validate the input is connected + // Just validates that the input is connected. 
ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, "OutputLayer: Input slot must be connected."); } diff --git a/src/armnn/layers/PermuteLayer.cpp b/src/armnn/layers/PermuteLayer.cpp index 35692756a1..444de81320 100644 --- a/src/armnn/layers/PermuteLayer.cpp +++ b/src/armnn/layers/PermuteLayer.cpp @@ -31,19 +31,25 @@ PermuteLayer* PermuteLayer::Clone(Graph& graph) const return CloneBase<PermuteLayer>(graph, m_Param, GetName()); } +std::vector<TensorShape> PermuteLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const +{ + BOOST_ASSERT(inputShapes.size() == 1); + const TensorShape& inShape = inputShapes[0]; + return std::vector<TensorShape> ({armnnUtils::Permuted(inShape, m_Param.m_DimMappings)}); +} + void PermuteLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "PermuteLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "PermuteLayer: TensorInfo must be set on connected InputSlot."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - const TensorInfo& infoIn = GetInputSlot(0).GetConnection()->GetTensorInfo(); - TensorShape shapeOut = armnnUtils::Permuted(infoIn.GetShape(), m_Param.m_DimMappings); ConditionalThrowIfNotEqual<LayerValidationException>( "PermuteLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - shapeOut); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/PermuteLayer.hpp b/src/armnn/layers/PermuteLayer.hpp index c060a16390..2700dd2c7b 100644 --- a/src/armnn/layers/PermuteLayer.hpp +++ b/src/armnn/layers/PermuteLayer.hpp @@ -18,6 +18,7 @@ public: PermuteLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; const PermutationVector& GetPermutation() const { diff --git a/src/armnn/layers/Pooling2dLayer.cpp b/src/armnn/layers/Pooling2dLayer.cpp index ede37d7604..68049101e7 100644 --- a/src/armnn/layers/Pooling2dLayer.cpp +++ b/src/armnn/layers/Pooling2dLayer.cpp @@ -29,15 +29,10 @@ Pooling2dLayer* Pooling2dLayer::Clone(Graph& graph) const return CloneBase<Pooling2dLayer>(graph, m_Param, GetName()); } -void Pooling2dLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> Pooling2dLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "Pooling2dLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "Pooling2dLayer: TensorInfo must be set on connected InputSlot."); - - IOutputSlot* input = GetInputSlot(0).GetConnection(); - const TensorShape& inputShape = input->GetTensorInfo().GetShape(); + BOOST_ASSERT(inputShapes.size() == 1); + const TensorShape& inputShape = inputShapes[0]; // If we support multiple batch dimensions in the future, then this assert will need to change. 
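The Pooling2dLayer::InferOutputShapes hunk that follows computes each pooled extent from the padding, stride and the configured rounding mode, then pulls the size back if the last window would start entirely in the padded region. A condensed standalone sketch of that arithmetic (illustrative names; the actual code uses a CalcSize lambda fed with the descriptor's members):

#include <cmath>

// Sketch: pooled output extent for one axis with floor or ceiling rounding.
unsigned int PooledSize(unsigned int inSize, unsigned int lowPad, unsigned int highPad,
                        unsigned int poolSize, unsigned int stride, bool ceilRounding)
{
    double extent = static_cast<double>(inSize + lowPad + highPad - poolSize) / stride + 1.0;
    unsigned int size = static_cast<unsigned int>(ceilRounding ? std::ceil(extent) : std::floor(extent));

    // Make sure the last pooling window starts inside the input rather than the padded area.
    if ((size - 1) * stride >= inSize + lowPad)
    {
        --size;
    }
    return size;
}
// e.g. PooledSize(7, 0, 0, 3, 2, /*ceilRounding=*/true) == 3.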
BOOST_ASSERT_MSG(inputShape.GetNumDimensions() == 4, "Pooling2dLayer will always have 4D input."); @@ -75,8 +70,8 @@ void Pooling2dLayer::ValidateTensorShapesFromInputs() BOOST_ASSERT_MSG(false, "Unsupported Output Shape Rounding"); } - // Make sure that border operations will start from inside the input and not the padded area - // This is what both Caffe and CL does... + // MakeS sure that border operations will start from inside the input and not the padded area. + // This is what both Caffe and CL do... if ((size - 1)*stride >= inSize + lowPad) { --size; @@ -89,18 +84,25 @@ void Pooling2dLayer::ValidateTensorShapesFromInputs() m_Param.m_PaddingMethod, m_Param.m_OutputShapeRounding); outHeight= CalcSize(inHeight, m_Param.m_PadTop, m_Param.m_PadBottom, m_Param.m_PoolHeight, m_Param.m_StrideY, m_Param.m_PaddingMethod, m_Param.m_OutputShapeRounding); - - } unsigned int outChannels = inChannels; unsigned int outBatchSize = inBatchSize; - TensorShape shapeOut({outBatchSize, outChannels, outHeight, outWidth}); + return std::vector<TensorShape>({ TensorShape({outBatchSize, outChannels, outHeight, outWidth}) }); +} + +void Pooling2dLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual<LayerValidationException>( "Pooling2dLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - shapeOut); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/Pooling2dLayer.hpp b/src/armnn/layers/Pooling2dLayer.hpp index af39dbb5ec..d5950d6ec3 100644 --- a/src/armnn/layers/Pooling2dLayer.hpp +++ b/src/armnn/layers/Pooling2dLayer.hpp @@ -9,19 +9,20 @@ namespace armnn { -class SoftmaxLayer : public LayerWithParameters<SoftmaxDescriptor> +class Pooling2dLayer : public LayerWithParameters<Pooling2dDescriptor> { public: virtual std::unique_ptr<IWorkload> CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const override; - SoftmaxLayer* Clone(Graph& graph) const override; + Pooling2dLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; protected: - SoftmaxLayer(const SoftmaxDescriptor& param, const char* name); - ~SoftmaxLayer() = default; + Pooling2dLayer(const Pooling2dDescriptor& param, const char* name); + ~Pooling2dLayer() = default; }; } // namespace diff --git a/src/armnn/layers/ReshapeLayer.cpp b/src/armnn/layers/ReshapeLayer.cpp index df5d9d5bb0..248a45c491 100644 --- a/src/armnn/layers/ReshapeLayer.cpp +++ b/src/armnn/layers/ReshapeLayer.cpp @@ -30,17 +30,23 @@ ReshapeLayer* ReshapeLayer::Clone(Graph& graph) const return CloneBase<ReshapeLayer>(graph, m_Param, GetName()); } +std::vector<TensorShape> ReshapeLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const +{ + return std::vector<TensorShape>({ m_Param.m_TargetShape }); +} + void ReshapeLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "ReshapeLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "ReshapeLayer: TensorInfo must be set on connected OutputSlot."); + VerifyLayerConnections(1, 
CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ }); + + BOOST_ASSERT(inferredShapes.size() == 1); ConditionalThrowIfNotEqual<LayerValidationException>( "ReshapeLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - m_Param.m_TargetShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/ReshapeLayer.hpp b/src/armnn/layers/ReshapeLayer.hpp index 8a3cf3a698..4435ba9bf8 100644 --- a/src/armnn/layers/ReshapeLayer.hpp +++ b/src/armnn/layers/ReshapeLayer.hpp @@ -18,6 +18,7 @@ public: ReshapeLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; bool IsEqual(const Layer& other) const { diff --git a/src/armnn/layers/ResizeBilinearLayer.cpp b/src/armnn/layers/ResizeBilinearLayer.cpp index 204d5afae8..6477fa375a 100644 --- a/src/armnn/layers/ResizeBilinearLayer.cpp +++ b/src/armnn/layers/ResizeBilinearLayer.cpp @@ -30,23 +30,31 @@ ResizeBilinearLayer* ResizeBilinearLayer::Clone(Graph& graph) const return CloneBase<ResizeBilinearLayer>(graph, m_Param, GetName()); } -void ResizeBilinearLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> ResizeBilinearLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "MemCopyLayer: InputSlot must be connected to an OutputSlot"); - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection()->IsTensorInfoSet(), - "MemCopyLayer: TensorInfo must be set on connected OutputSlot."); + BOOST_ASSERT(inputShapes.size() == 1); + const TensorShape& inputShape = inputShapes[0]; - const TensorShape& inputShape = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(); unsigned int outWidth = m_Param.m_TargetWidth; unsigned int outHeight = m_Param.m_TargetHeight; unsigned int outChannels = inputShape[1]; unsigned int outBatch = inputShape[0]; - TensorShape outShape({outBatch, outChannels, outHeight, outWidth}); + + return std::vector<TensorShape>({ TensorShape({outBatch, outChannels, outHeight, outWidth}) }); +} + +void ResizeBilinearLayer::ValidateTensorShapesFromInputs() +{ + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); + ConditionalThrowIfNotEqual<LayerValidationException>( "ResizeBilinearLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/ResizeBilinearLayer.hpp b/src/armnn/layers/ResizeBilinearLayer.hpp index 2cefedb0b8..e6798ce531 100644 --- a/src/armnn/layers/ResizeBilinearLayer.hpp +++ b/src/armnn/layers/ResizeBilinearLayer.hpp @@ -18,6 +18,7 @@ public: ResizeBilinearLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; protected: ResizeBilinearLayer(const ResizeBilinearDescriptor& param, const char* name); diff --git a/src/armnn/layers/SoftmaxLayer.cpp b/src/armnn/layers/SoftmaxLayer.cpp index 2bd0c1d106..7c42b7a3c9 100644 --- a/src/armnn/layers/SoftmaxLayer.cpp +++ b/src/armnn/layers/SoftmaxLayer.cpp @@ -31,14 +31,16 @@ 
SoftmaxLayer* SoftmaxLayer::Clone(Graph& graph) const void SoftmaxLayer::ValidateTensorShapesFromInputs() { - ConditionalThrow<LayerValidationException>(GetInputSlot(0).GetConnection() != nullptr, - "SoftmaxLayer: Input slot must be connected."); + VerifyLayerConnections(1, CHECK_LOCATION()); + + auto inferredShapes = InferOutputShapes({ GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape() }); + + BOOST_ASSERT(inferredShapes.size() == 1); - const TensorShape& outShape = GetInputSlot(0).GetConnection()->GetTensorInfo().GetShape(); ConditionalThrowIfNotEqual<LayerValidationException>( "SoftmaxLayer: TensorShape set on OutputSlot[0] does not match the inferred shape.", GetOutputSlot(0).GetTensorInfo().GetShape(), - outShape); + inferredShapes[0]); } } // namespace armnn diff --git a/src/armnn/layers/SoftmaxLayer.hpp b/src/armnn/layers/SoftmaxLayer.hpp index ff60a08a91..af39dbb5ec 100644 --- a/src/armnn/layers/SoftmaxLayer.hpp +++ b/src/armnn/layers/SoftmaxLayer.hpp @@ -9,19 +9,19 @@ namespace armnn { -class Pooling2dLayer : public LayerWithParameters<Pooling2dDescriptor> +class SoftmaxLayer : public LayerWithParameters<SoftmaxDescriptor> { public: virtual std::unique_ptr<IWorkload> CreateWorkload(const Graph& graph, const IWorkloadFactory& factory) const override; - Pooling2dLayer* Clone(Graph& graph) const override; + SoftmaxLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; protected: - Pooling2dLayer(const Pooling2dDescriptor& param, const char* name); - ~Pooling2dLayer() = default; + SoftmaxLayer(const SoftmaxDescriptor& param, const char* name); + ~SoftmaxLayer() = default; }; } // namespace diff --git a/src/armnn/layers/SplitterLayer.cpp b/src/armnn/layers/SplitterLayer.cpp index 630921e4d8..5e737a245e 100644 --- a/src/armnn/layers/SplitterLayer.cpp +++ b/src/armnn/layers/SplitterLayer.cpp @@ -22,7 +22,7 @@ std::unique_ptr<IWorkload> SplitterLayer::CreateWorkload(const Graph& graph, con { SplitterQueueDescriptor descriptor; - // copy the window origins to the descriptor + // Copies the window origins to the descriptor. for (unsigned int i = 0; i < m_Param.GetNumViews(); ++i) { descriptor.m_ViewOrigins.emplace_back( @@ -34,14 +34,14 @@ std::unique_ptr<IWorkload> SplitterLayer::CreateWorkload(const Graph& graph, con void SplitterLayer::CreateTensorHandles(Graph& graph, const IWorkloadFactory& factory) { - //if sub tensors are supported than all the "splitter" need to do is to + //If sub tensors are supported than all the "splitter" need to do is to //set the outputs to be appropriate sub tensors of the input. if (factory.SupportsSubTensors()) { const OutputHandler& outputHandler = GetInputSlots()[0].GetConnectedOutputSlot()->GetOutputHandler(); ITensorHandle* inputData = outputHandler.GetData(); - //create the outputs as subtensors of the input + //Creates the outputs as subtensors of the input. for (unsigned int i = 0; i < m_Param.GetNumViews(); ++i) { m_OutputHandlers[i].SetData(factory.CreateSubTensorHandle(*inputData, @@ -63,18 +63,38 @@ SplitterLayer* SplitterLayer::Clone(Graph& graph) const return CloneBase<SplitterLayer>(graph, m_Param, GetName()); } -void SplitterLayer::ValidateTensorShapesFromInputs() +std::vector<TensorShape> SplitterLayer::InferOutputShapes(const std::vector<TensorShape>& inputShapes) const { + BOOST_ASSERT(inputShapes.size() == m_Param.GetNumViews()); + std::vector<TensorShape> outShapes; //Output shapes must match View shapes. 
for (unsigned int viewIdx = 0; viewIdx < m_Param.GetNumViews(); viewIdx++) { const uint32_t* sizes = m_Param.GetViewSizes(viewIdx); + outShapes.push_back(TensorShape(m_Param.GetNumDimensions(), sizes)); + } + return outShapes; +} + +void SplitterLayer::ValidateTensorShapesFromInputs() +{ + std::vector<TensorShape> views; + for (unsigned int viewIdx = 0; viewIdx < m_Param.GetNumViews(); viewIdx++) + { + const uint32_t* sizes = m_Param.GetViewSizes(viewIdx); + views.push_back(TensorShape(m_Param.GetNumDimensions(), sizes)); + } + + auto inferredShapes = InferOutputShapes(views); - TensorShape outShape(m_Param.GetNumDimensions(), sizes); + BOOST_ASSERT(inferredShapes.size() == m_Param.GetNumViews()); + + for (unsigned int viewIdx = 0; viewIdx < m_Param.GetNumViews(); viewIdx++) + { ConditionalThrowIfNotEqual<LayerValidationException>( "SplitterLayer: View sizes must match output tensor shapes.", GetOutputSlot(viewIdx).GetTensorInfo().GetShape(), - outShape); + inferredShapes[viewIdx]); } } diff --git a/src/armnn/layers/SplitterLayer.hpp b/src/armnn/layers/SplitterLayer.hpp index 7e5bbd2668..8e361b4d5c 100644 --- a/src/armnn/layers/SplitterLayer.hpp +++ b/src/armnn/layers/SplitterLayer.hpp @@ -19,6 +19,7 @@ public: SplitterLayer* Clone(Graph& graph) const override; void ValidateTensorShapesFromInputs() override; + std::vector<TensorShape> InferOutputShapes(const std::vector<TensorShape>& inputShapes) const override; protected: SplitterLayer(const ViewsDescriptor& param, const char* name); diff --git a/src/armnn/memory/BaseMemoryManager.cpp b/src/armnn/memory/BaseMemoryManager.cpp new file mode 100644 index 0000000000..07f42333d6 --- /dev/null +++ b/src/armnn/memory/BaseMemoryManager.cpp @@ -0,0 +1,125 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "BaseMemoryManager.hpp" + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#include "memory/BlobLifetimeManager.hpp" +#include "memory/PoolManager.hpp" +#include "memory/OffsetLifetimeManager.hpp" +#endif + +#include <boost/polymorphic_cast.hpp> + +namespace armnn +{ + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +BaseMemoryManager::BaseMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc, + MemoryAffinity memoryAffinity) +{ + // (Re)create the memory manager components. + m_Allocator = std::move(alloc); + + m_IntraLayerMemoryMgr = CreateArmComputeMemoryManager(memoryAffinity); + m_InterLayerMemoryMgr = CreateArmComputeMemoryManager(memoryAffinity); +} + +std::shared_ptr<arm_compute::MemoryManagerOnDemand> +BaseMemoryManager::CreateArmComputeMemoryManager(MemoryAffinity memoryAffinity) +{ + std::shared_ptr<arm_compute::ILifetimeManager> lifetimeManager = nullptr; + + if (memoryAffinity == MemoryAffinity::Buffer) + { + lifetimeManager = std::make_shared<BlobLifetimeManager>(); + } + else + { + lifetimeManager = std::make_shared<OffsetLifetimeManager>(); + } + + auto poolManager = std::make_shared<PoolManager>(); + auto memoryManager = std::make_shared<arm_compute::MemoryManagerOnDemand>(lifetimeManager, poolManager); + + // Set allocator that the memory manager will use + memoryManager->set_allocator(m_Allocator.get()); + + return memoryManager; +} + +void BaseMemoryManager::FinalizeMemoryManager(arm_compute::MemoryManagerOnDemand& memoryManager) +{ + // Number of pools that the manager will create. 
This specifies how many layers you want to run in parallel + memoryManager.set_num_pools(1); + + // Finalize the memory manager. (Validity checks, memory allocations, etc) + memoryManager.finalize(); +} + +void BaseMemoryManager::Finalize() +{ + BOOST_ASSERT(m_IntraLayerMemoryMgr); + FinalizeMemoryManager(*m_IntraLayerMemoryMgr.get()); + + BOOST_ASSERT(m_InterLayerMemoryMgr); + FinalizeMemoryManager(*m_InterLayerMemoryMgr.get()); +} + +void BaseMemoryManager::Acquire() +{ + // Allocate memory pools for intra-layer memory manager + BOOST_ASSERT(m_IntraLayerMemoryMgr); + IPoolManager* poolManager = boost::polymorphic_downcast<IPoolManager*>(m_IntraLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->AllocatePools(); + + // Allocate memory pools for inter-layer memory manager + BOOST_ASSERT(m_InterLayerMemoryMgr); + poolManager = boost::polymorphic_downcast<IPoolManager*>(m_InterLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->AllocatePools(); + + // Acquire inter-layer memory group. NOTE: This has to come after allocating the pools + BOOST_ASSERT(m_InterLayerMemoryGroup); + m_InterLayerMemoryGroup->acquire(); +} + +void BaseMemoryManager::Release() +{ + // Release inter-layer memory group. NOTE: This has to come before releasing the pools + BOOST_ASSERT(m_InterLayerMemoryGroup); + m_InterLayerMemoryGroup->release(); + + // Release memory pools managed by intra-layer memory manager + BOOST_ASSERT(m_IntraLayerMemoryMgr); + IPoolManager* poolManager = boost::polymorphic_downcast<IPoolManager*>(m_IntraLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->ReleasePools(); + + // Release memory pools managed by inter-layer memory manager + BOOST_ASSERT(m_InterLayerMemoryMgr); + poolManager = boost::polymorphic_downcast<IPoolManager*>(m_InterLayerMemoryMgr->pool_manager()); + BOOST_ASSERT(poolManager); + poolManager->ReleasePools(); +} +#endif + +#ifdef ARMCOMPUTENEON_ENABLED +std::shared_ptr<arm_compute::IMemoryGroup> +NeonMemoryManager::CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) +{ + return std::make_shared<arm_compute::MemoryGroup>(memoryManager); +} +#endif + +#ifdef ARMCOMPUTECL_ENABLED +std::shared_ptr<arm_compute::IMemoryGroup> +ClMemoryManager::CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) +{ + return std::make_shared<arm_compute::CLMemoryGroup>(memoryManager); +} +#endif + +} diff --git a/src/armnn/memory/BaseMemoryManager.hpp b/src/armnn/memory/BaseMemoryManager.hpp new file mode 100644 index 0000000000..433d0ea9ad --- /dev/null +++ b/src/armnn/memory/BaseMemoryManager.hpp @@ -0,0 +1,104 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#pragma once + +#include "backends/WorkloadFactory.hpp" + +#ifdef ARMCOMPUTENEON_ENABLED +#include "arm_compute/runtime/MemoryGroup.h" +#endif + +#ifdef ARMCOMPUTECL_ENABLED +#include "arm_compute/runtime/CL/CLMemoryGroup.h" +#endif + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) +#include "arm_compute/runtime/IAllocator.h" +#include "arm_compute/runtime/IMemoryGroup.h" +#include "arm_compute/runtime/MemoryManagerOnDemand.h" +#endif + +namespace armnn +{ + +class BaseMemoryManager +{ +public: + enum class MemoryAffinity + { + Buffer, + Offset + }; + + BaseMemoryManager() { } + virtual ~BaseMemoryManager() { } + +#if defined(ARMCOMPUTENEON_ENABLED) || defined(ARMCOMPUTECL_ENABLED) + + BaseMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc, MemoryAffinity memoryAffinity); + + std::shared_ptr<arm_compute::MemoryManagerOnDemand>& GetIntraLayerManager() { return m_IntraLayerMemoryMgr; } + std::shared_ptr<arm_compute::MemoryManagerOnDemand>& GetInterLayerManager() { return m_InterLayerMemoryMgr; } + std::shared_ptr<arm_compute::IMemoryGroup>& GetInterLayerMemoryGroup() { return m_InterLayerMemoryGroup; } + + void Finalize(); + void Acquire(); + void Release(); + +protected: + + std::unique_ptr<arm_compute::IAllocator> m_Allocator; + std::shared_ptr<arm_compute::MemoryManagerOnDemand> m_IntraLayerMemoryMgr; + std::shared_ptr<arm_compute::MemoryManagerOnDemand> m_InterLayerMemoryMgr; + std::shared_ptr<arm_compute::IMemoryGroup> m_InterLayerMemoryGroup; + + std::shared_ptr<arm_compute::MemoryManagerOnDemand> CreateArmComputeMemoryManager(MemoryAffinity memoryAffinity); + + virtual std::shared_ptr<arm_compute::IMemoryGroup> + CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) = 0; + + void FinalizeMemoryManager(arm_compute::MemoryManagerOnDemand& memoryManager); +#endif +}; + +class NeonMemoryManager : public BaseMemoryManager +{ +public: + NeonMemoryManager() {} + virtual ~NeonMemoryManager() {} + +#ifdef ARMCOMPUTENEON_ENABLED + NeonMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc, MemoryAffinity memoryAffinity) + : BaseMemoryManager(std::move(alloc), memoryAffinity) + { + m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); + } + +protected: + virtual std::shared_ptr<arm_compute::IMemoryGroup> + CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) override; +#endif +}; + +class ClMemoryManager : public BaseMemoryManager +{ +public: + ClMemoryManager() {} + virtual ~ClMemoryManager() {} + +#ifdef ARMCOMPUTECL_ENABLED + ClMemoryManager(std::unique_ptr<arm_compute::IAllocator> alloc) + : BaseMemoryManager(std::move(alloc), MemoryAffinity::Buffer) + { + m_InterLayerMemoryGroup = CreateMemoryGroup(m_InterLayerMemoryMgr); + } + +protected: + virtual std::shared_ptr<arm_compute::IMemoryGroup> + CreateMemoryGroup(const std::shared_ptr<arm_compute::MemoryManagerOnDemand>& memoryManager) override; +#endif +}; + +} //namespace armnn
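The classes above only wire the Arm Compute Library pieces together, so a minimal usage sketch may make the intended lifecycle clearer. Everything below is illustrative rather than part of this change: it assumes a NEON-enabled build (ARMCOMPUTENEON_ENABLED), uses arm_compute::Allocator merely as a convenient concrete IAllocator, and the Offset affinity is picked arbitrarily for the example.

#include <memory>

#include "memory/BaseMemoryManager.hpp"
#include "arm_compute/runtime/Allocator.h"   // assumed concrete IAllocator for illustration

void ExampleNeonMemoryManagerLifecycle()
{
    // One manager owns both the intra-layer and the inter-layer ACL memory managers.
    armnn::NeonMemoryManager memoryManager(std::make_unique<arm_compute::Allocator>(),
                                           armnn::BaseMemoryManager::MemoryAffinity::Offset);

    // ... workloads are created here and register their tensors with
    //     memoryManager.GetIntraLayerManager() / GetInterLayerManager() ...

    memoryManager.Finalize();   // Runs validity checks and creates one pool per manager.

    memoryManager.Acquire();    // Allocates the pools, then acquires the inter-layer memory group.
    // ... run inference ...
    memoryManager.Release();    // Releases the inter-layer group first, then the pools.
}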
\ No newline at end of file diff --git a/src/armnn/memory/BlobLifetimeManager.cpp b/src/armnn/memory/BlobLifetimeManager.cpp new file mode 100644 index 0000000000..5b085b2f5e --- /dev/null +++ b/src/armnn/memory/BlobLifetimeManager.cpp @@ -0,0 +1,79 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "BlobLifetimeManager.hpp" +#include "BlobMemoryPool.hpp" + +#include "arm_compute/runtime/IMemoryGroup.h" + +#include "boost/assert.hpp" + +#include <algorithm> + +namespace armnn +{ + +BlobLifetimeManager::BlobLifetimeManager() + : m_BlobSizes() +{ +} + +arm_compute::MappingType BlobLifetimeManager::mapping_type() const +{ + return arm_compute::MappingType::BLOBS; +} + +void BlobLifetimeManager::update_blobs_and_mappings() +{ + using namespace arm_compute; + + BOOST_ASSERT(are_all_finalized()); + BOOST_ASSERT(_active_group); + + // Sort free blobs requirements in descending order. + _free_blobs.sort([](const Blob & ba, const Blob & bb) + { + return ba.max_size > bb.max_size; + }); + std::vector<size_t> groupSizes; + std::transform(std::begin(_free_blobs), std::end(_free_blobs), std::back_inserter(groupSizes), [](const Blob & b) + { + return b.max_size; + }); + + // Update blob sizes + size_t max_size = std::max(m_BlobSizes.size(), groupSizes.size()); + m_BlobSizes.resize(max_size, 0); + groupSizes.resize(max_size, 0); + std::transform(std::begin(m_BlobSizes), std::end(m_BlobSizes), std::begin(groupSizes), + std::begin(m_BlobSizes), [](size_t lhs, size_t rhs) + { + return std::max(lhs, rhs); + }); + + // Calculate group mappings + auto& groupMappings = _active_group->mappings(); + unsigned int blobIdx = 0; + + for(auto& freeBlob : _free_blobs) + { + for(auto& boundElementId : freeBlob.bound_elements) + { + BOOST_ASSERT(_active_elements.find(boundElementId) != std::end(_active_elements)); + + Element& boundElement = _active_elements[boundElementId]; + groupMappings[boundElement.handle] = blobIdx; + } + + ++blobIdx; + } +} + +std::unique_ptr<arm_compute::IMemoryPool> BlobLifetimeManager::create_pool(arm_compute::IAllocator* allocator) +{ + BOOST_ASSERT(allocator); + return std::make_unique<BlobMemoryPool>(allocator, m_BlobSizes); +} + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/BlobLifetimeManager.hpp b/src/armnn/memory/BlobLifetimeManager.hpp new file mode 100644 index 0000000000..8bb8b326c4 --- /dev/null +++ b/src/armnn/memory/BlobLifetimeManager.hpp @@ -0,0 +1,35 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/ISimpleLifetimeManager.h" + +namespace armnn +{ + +class BlobLifetimeManager : public arm_compute::ISimpleLifetimeManager +{ +public: + BlobLifetimeManager(); + + BlobLifetimeManager(const BlobLifetimeManager&) = delete; + + BlobLifetimeManager& operator=(const BlobLifetimeManager&) = delete; + + BlobLifetimeManager(BlobLifetimeManager&&) = default; + + BlobLifetimeManager& operator=(BlobLifetimeManager&&) = default; + + std::unique_ptr<arm_compute::IMemoryPool> create_pool(arm_compute::IAllocator* allocator) override; + + arm_compute::MappingType mapping_type() const override; + +private: + void update_blobs_and_mappings() override; + + std::vector<size_t> m_BlobSizes; +}; + +} // namespace armnn
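The interesting part of BlobLifetimeManager is how it grows m_BlobSizes: for each memory group the free blobs are sorted by descending max_size and merged element-wise into the sizes recorded so far, so every blob ends up large enough for each group that reuses it. The standalone helper below restates that rule with made-up sizes, purely for illustration; it is not code from this change.

#include <algorithm>
#include <cstddef>
#include <functional>
#include <vector>

std::vector<size_t> MergeBlobSizes(std::vector<size_t> current, std::vector<size_t> group)
{
    // Largest requests first, mirroring the sort applied to _free_blobs above.
    std::sort(group.begin(), group.end(), std::greater<size_t>());

    const size_t maxSize = std::max(current.size(), group.size());
    current.resize(maxSize, 0);
    group.resize(maxSize, 0);

    // Element-wise maximum: blob i must satisfy the i-th largest request of every group.
    for (size_t i = 0; i < maxSize; ++i)
    {
        current[i] = std::max(current[i], group[i]);
    }
    return current;
}

// E.g. merging {200, 100, 50} with {300, 80} yields {300, 100, 50}: three blobs are kept,
// each sized for the largest request seen at that position.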
\ No newline at end of file diff --git a/src/armnn/memory/BlobMemoryPool.cpp b/src/armnn/memory/BlobMemoryPool.cpp new file mode 100644 index 0000000000..c9f44a4dc6 --- /dev/null +++ b/src/armnn/memory/BlobMemoryPool.cpp @@ -0,0 +1,88 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "BlobMemoryPool.hpp" + +#include <boost/assert.hpp> + +namespace armnn +{ + +BlobMemoryPool::BlobMemoryPool(arm_compute::IAllocator* allocator, std::vector<size_t> blobSizes) + : m_Allocator(allocator) + , m_Blobs() + , m_BlobSizes(std::move(blobSizes)) + , m_MemoryAllocated(false) +{ + AllocatePool(); +} + +BlobMemoryPool::~BlobMemoryPool() +{ + ReleasePool(); +} + +void BlobMemoryPool::acquire(arm_compute::MemoryMappings& handles) +{ + // Set memory to handlers + for (auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = m_Blobs[handle.second]; + } +} + +void BlobMemoryPool::release(arm_compute::MemoryMappings &handles) +{ + for (auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = nullptr; + } +} + +arm_compute::MappingType BlobMemoryPool::mapping_type() const +{ + return arm_compute::MappingType::BLOBS; +} + +std::unique_ptr<arm_compute::IMemoryPool> BlobMemoryPool::duplicate() +{ + BOOST_ASSERT(m_Allocator); + return std::make_unique<BlobMemoryPool>(m_Allocator, m_BlobSizes); +} + +void BlobMemoryPool::AllocatePool() +{ + if (!m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + + for (const auto& blobSize : m_BlobSizes) + { + m_Blobs.push_back(m_Allocator->allocate(blobSize, 0)); + } + + m_MemoryAllocated = true; + } +} + +void BlobMemoryPool::ReleasePool() +{ + if (m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + + for (auto& blob : m_Blobs) + { + m_Allocator->free(blob); + } + + m_Blobs.clear(); + + m_MemoryAllocated = false; + } +} + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/BlobMemoryPool.hpp b/src/armnn/memory/BlobMemoryPool.hpp new file mode 100644 index 0000000000..b17db2ea65 --- /dev/null +++ b/src/armnn/memory/BlobMemoryPool.hpp @@ -0,0 +1,55 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IMemoryPool.hpp" + +#include "arm_compute/runtime/IAllocator.h" +#include "arm_compute/runtime/Types.h" + +namespace armnn +{ + +/** Blob memory pool */ +class BlobMemoryPool : public IMemoryPool +{ +public: + BlobMemoryPool(arm_compute::IAllocator* allocator, std::vector<size_t> blobSizes); + + ~BlobMemoryPool(); + + BlobMemoryPool(const BlobMemoryPool&) = delete; + + BlobMemoryPool& operator=(const BlobMemoryPool&) = delete; + + BlobMemoryPool(BlobMemoryPool&&) = default; + + BlobMemoryPool& operator=(BlobMemoryPool&&) = default; + + void acquire(arm_compute::MemoryMappings &handles) override; + void release(arm_compute::MemoryMappings &handles) override; + + arm_compute::MappingType mapping_type() const override; + + std::unique_ptr<arm_compute::IMemoryPool> duplicate() override; + + void AllocatePool() override; + void ReleasePool() override; + +private: + /// Allocator to use for internal allocation + arm_compute::IAllocator* m_Allocator; + + /// Vector holding all the memory blobs + std::vector<void*> m_Blobs; + + /// Sizes of each memory blob + std::vector<size_t> m_BlobSizes; + + /// Flag indicating whether memory has been allocated for the pool + bool m_MemoryAllocated; +}; + +} // namespace armnn
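A BlobMemoryPool hands each mapping handle the base address of the blob it is bound to. The sketch below is not part of the change; it assumes arm_compute::Allocator as a concrete IAllocator and that arm_compute::MemoryMappings maps a void** handle to the blob index consumed by acquire() above.

#include "memory/BlobMemoryPool.hpp"
#include "arm_compute/runtime/Allocator.h"   // assumed concrete IAllocator for illustration

void ExampleBlobPool()
{
    arm_compute::Allocator allocator;
    armnn::BlobMemoryPool pool(&allocator, {1024, 256});   // two blobs, allocated in the constructor

    void* tensorMemory = nullptr;
    arm_compute::MemoryMappings mappings;                   // assumed: handle -> blob index
    mappings[&tensorMemory] = 0;                             // this handle is bound to blob 0

    pool.acquire(mappings);    // tensorMemory now points at the 1024-byte blob
    // ... run the workloads that use tensorMemory ...
    pool.release(mappings);    // tensorMemory is reset to nullptr
}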
\ No newline at end of file diff --git a/src/armnn/memory/IMemoryPool.hpp b/src/armnn/memory/IMemoryPool.hpp new file mode 100644 index 0000000000..8c73b484c4 --- /dev/null +++ b/src/armnn/memory/IMemoryPool.hpp @@ -0,0 +1,22 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/IMemoryPool.h" + +namespace armnn +{ + +class IMemoryPool : public arm_compute::IMemoryPool +{ +public: + /// Allocates memory for the entire pool + virtual void AllocatePool() = 0; + + /// Releases all memory associated with the pool + virtual void ReleasePool() = 0; +}; + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/IPoolManager.hpp b/src/armnn/memory/IPoolManager.hpp new file mode 100644 index 0000000000..9b06152538 --- /dev/null +++ b/src/armnn/memory/IPoolManager.hpp @@ -0,0 +1,21 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/IPoolManager.h" + +namespace armnn +{ + +class IPoolManager : public arm_compute::IPoolManager { +public: + // Allocates all pools within the pool manager + virtual void AllocatePools() = 0; + + // Releases all pools within the pool manager + virtual void ReleasePools() = 0; +}; + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/OffsetLifetimeManager.cpp b/src/armnn/memory/OffsetLifetimeManager.cpp new file mode 100644 index 0000000000..bcbbb0b793 --- /dev/null +++ b/src/armnn/memory/OffsetLifetimeManager.cpp @@ -0,0 +1,62 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "OffsetLifetimeManager.hpp" +#include "OffsetMemoryPool.hpp" + +#include "arm_compute/runtime/IMemoryGroup.h" + +#include <numeric> + +#include "boost/assert.hpp" + +namespace armnn +{ + +OffsetLifetimeManager::OffsetLifetimeManager() + : m_BlobSize(0) +{ +} + +std::unique_ptr<arm_compute::IMemoryPool> OffsetLifetimeManager::create_pool(arm_compute::IAllocator* allocator) +{ + BOOST_ASSERT(allocator); + return std::make_unique<OffsetMemoryPool>(allocator, m_BlobSize); +} + +arm_compute::MappingType OffsetLifetimeManager::mapping_type() const +{ + return arm_compute::MappingType::OFFSETS; +} + +void OffsetLifetimeManager::update_blobs_and_mappings() +{ + BOOST_ASSERT(are_all_finalized()); + BOOST_ASSERT(_active_group); + + // Update blob size + size_t maxGroupSize = std::accumulate(std::begin(_free_blobs), std::end(_free_blobs), + static_cast<size_t>(0), [](size_t s, const Blob& b) + { + return s + b.max_size; + }); + m_BlobSize = std::max(m_BlobSize, maxGroupSize); + + // Calculate group mappings + auto& groupMappings = _active_group->mappings(); + size_t offset = 0; + for(auto& freeBlob : _free_blobs) + { + for(auto& boundElementId : freeBlob.bound_elements) + { + BOOST_ASSERT(_active_elements.find(boundElementId) != std::end(_active_elements)); + Element& boundElement = _active_elements[boundElementId]; + groupMappings[boundElement.handle] = offset; + } + offset += freeBlob.max_size; + BOOST_ASSERT(offset <= m_BlobSize); + } +} + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/OffsetLifetimeManager.hpp b/src/armnn/memory/OffsetLifetimeManager.hpp new file mode 100644 index 0000000000..d6a5698d95 --- /dev/null +++ b/src/armnn/memory/OffsetLifetimeManager.hpp @@ -0,0 +1,37 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "arm_compute/runtime/ISimpleLifetimeManager.h" + +namespace armnn +{ + +class OffsetLifetimeManager : public arm_compute::ISimpleLifetimeManager +{ +public: + OffsetLifetimeManager(); + + OffsetLifetimeManager(const OffsetLifetimeManager&) = delete; + + OffsetLifetimeManager& operator=(const OffsetLifetimeManager&) = delete; + + OffsetLifetimeManager(OffsetLifetimeManager&&) = default; + + OffsetLifetimeManager& operator=(OffsetLifetimeManager&&) = default; + + std::unique_ptr<arm_compute::IMemoryPool> create_pool(arm_compute::IAllocator* allocator) override; + + arm_compute::MappingType mapping_type() const override; + +private: + void update_blobs_and_mappings() override; + +private: + /// Memory blob size + size_t m_BlobSize; +}; + +} // namespace armnn
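OffsetLifetimeManager collapses each memory group into a single shared blob: every element gets a byte offset, and the blob must be as large as the most demanding group. The helper below only restates that sizing rule with illustrative numbers; it is not code from this change.

#include <algorithm>
#include <cstddef>
#include <vector>

size_t RequiredOffsetBlobSize(const std::vector<std::vector<size_t>>& groups)
{
    size_t blobSize = 0;
    for (const auto& group : groups)
    {
        size_t groupSize = 0;
        for (size_t elementSize : group)
        {
            groupSize += elementSize;   // elements get consecutive offsets within the blob
        }
        blobSize = std::max(blobSize, groupSize);
    }
    return blobSize;
}

// E.g. RequiredOffsetBlobSize({{256, 128}, {300}}) == 384: the first group maps its elements to
// offsets 0 and 256, the second maps its single element to offset 0 of the same blob.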
\ No newline at end of file diff --git a/src/armnn/memory/OffsetMemoryPool.cpp b/src/armnn/memory/OffsetMemoryPool.cpp new file mode 100644 index 0000000000..cae79c0a86 --- /dev/null +++ b/src/armnn/memory/OffsetMemoryPool.cpp @@ -0,0 +1,84 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "OffsetMemoryPool.hpp" + +#include "boost/assert.hpp" + +#include <algorithm> + +namespace armnn +{ + +OffsetMemoryPool::OffsetMemoryPool(arm_compute::IAllocator* allocator, size_t blobSize) + : m_Allocator(allocator) + , m_Blob() + , m_BlobSize(blobSize) + , m_MemoryAllocated(false) +{ + AllocatePool(); +} + +OffsetMemoryPool::~OffsetMemoryPool() +{ + ReleasePool(); +} + +void OffsetMemoryPool::acquire(arm_compute::MemoryMappings& handles) +{ + BOOST_ASSERT(m_Blob); + + // Set memory to handlers + for(auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = reinterpret_cast<uint8_t*>(m_Blob) + handle.second; + } +} + +void OffsetMemoryPool::release(arm_compute::MemoryMappings &handles) +{ + for(auto& handle : handles) + { + BOOST_ASSERT(handle.first); + *handle.first = nullptr; + } +} + +arm_compute::MappingType OffsetMemoryPool::mapping_type() const +{ + return arm_compute::MappingType::OFFSETS; +} + +std::unique_ptr<arm_compute::IMemoryPool> OffsetMemoryPool::duplicate() +{ + BOOST_ASSERT(m_Allocator); + return std::make_unique<OffsetMemoryPool>(m_Allocator, m_BlobSize); +} + +void OffsetMemoryPool::AllocatePool() +{ + if (!m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + m_Blob = m_Allocator->allocate(m_BlobSize, 0); + + m_MemoryAllocated = true; + } +} + +void OffsetMemoryPool::ReleasePool() +{ + if (m_MemoryAllocated) + { + BOOST_ASSERT(m_Allocator); + + m_Allocator->free(m_Blob); + m_Blob = nullptr; + + m_MemoryAllocated = false; + } +} + +} // namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/OffsetMemoryPool.hpp b/src/armnn/memory/OffsetMemoryPool.hpp new file mode 100644 index 0000000000..a0391602fb --- /dev/null +++ b/src/armnn/memory/OffsetMemoryPool.hpp @@ -0,0 +1,54 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IMemoryPool.hpp" + +#include "arm_compute/runtime/IAllocator.h" +#include "arm_compute/runtime/Types.h" + +namespace armnn +{ + +class OffsetMemoryPool : public IMemoryPool +{ +public: + OffsetMemoryPool(arm_compute::IAllocator* allocator, size_t blobSize); + + ~OffsetMemoryPool(); + + OffsetMemoryPool(const OffsetMemoryPool&) = delete; + + OffsetMemoryPool& operator=(const OffsetMemoryPool&) = delete; + + OffsetMemoryPool(OffsetMemoryPool&&) = default; + + OffsetMemoryPool& operator=(OffsetMemoryPool &&) = default; + + void acquire(arm_compute::MemoryMappings& handles) override; + void release(arm_compute::MemoryMappings& handles) override; + + arm_compute::MappingType mapping_type() const override; + + std::unique_ptr<arm_compute::IMemoryPool> duplicate() override; + + void AllocatePool() override; + void ReleasePool() override; + +private: + /// Allocator to use for internal allocation + arm_compute::IAllocator* m_Allocator; + + /// Memory blob + void* m_Blob; + + /// Size of the allocated memory blob + size_t m_BlobSize; + + /// Flag indicating whether memory has been allocated for the pool + bool m_MemoryAllocated; +}; + +} // namespace armnn
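For contrast with the blob pool: in an OffsetMemoryPool the mapping value is a byte offset into the pool's single blob rather than an index into a vector of blobs. As before, arm_compute::Allocator and the shape of MemoryMappings are assumptions made only for this sketch.

#include "memory/OffsetMemoryPool.hpp"
#include "arm_compute/runtime/Allocator.h"   // assumed concrete IAllocator for illustration

void ExampleOffsetPool()
{
    arm_compute::Allocator allocator;
    armnn::OffsetMemoryPool pool(&allocator, 512);   // one 512-byte blob

    void* a = nullptr;
    void* b = nullptr;
    arm_compute::MemoryMappings mappings;            // assumed: handle -> byte offset
    mappings[&a] = 0;      // first tensor starts at the beginning of the blob
    mappings[&b] = 256;    // second tensor starts 256 bytes in

    pool.acquire(mappings);    // a == blob base, b == blob base + 256
    // ... execute ...
    pool.release(mappings);    // both handles reset to nullptr
}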
\ No newline at end of file diff --git a/src/armnn/memory/PoolManager.cpp b/src/armnn/memory/PoolManager.cpp new file mode 100644 index 0000000000..52cef47476 --- /dev/null +++ b/src/armnn/memory/PoolManager.cpp @@ -0,0 +1,105 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "IMemoryPool.hpp" +#include "PoolManager.hpp" + +#include "boost/assert.hpp" +#include "boost/polymorphic_cast.hpp" + +#include <algorithm> + +namespace armnn +{ + +PoolManager::PoolManager() + : m_FreePools() + , m_OccupiedPools() + , m_Semaphore() + , m_Mutex() +{} + +arm_compute::IMemoryPool *PoolManager::lock_pool() +{ + BOOST_ASSERT_MSG(!(m_FreePools.empty() && m_OccupiedPools.empty()), "Haven't setup any pools"); + + m_Semaphore->wait(); + std::lock_guard<arm_compute::Mutex> lock(m_Mutex); + + BOOST_ASSERT_MSG(!m_FreePools.empty(), "Empty pool must exist as semaphore has been signalled"); + m_OccupiedPools.splice(std::begin(m_OccupiedPools), m_FreePools, std::begin(m_FreePools)); + + return m_OccupiedPools.front().get(); +} + +void PoolManager::unlock_pool(arm_compute::IMemoryPool *pool) +{ + BOOST_ASSERT_MSG(!(m_FreePools.empty() && m_OccupiedPools.empty()), "Haven't setup any pools!"); + + std::lock_guard<arm_compute::Mutex> lock(m_Mutex); + + auto it = std::find_if( + std::begin(m_OccupiedPools), + std::end(m_OccupiedPools), + [pool](const std::unique_ptr<arm_compute::IMemoryPool> &poolIterator) + { + return poolIterator.get() == pool; + } + ); + + BOOST_ASSERT_MSG(it != std::end(m_OccupiedPools), "Pool to be unlocked couldn't be found"); + m_FreePools.splice(std::begin(m_FreePools), m_OccupiedPools, it); + m_Semaphore->signal(); +} + +void PoolManager::register_pool(std::unique_ptr<arm_compute::IMemoryPool> pool) +{ + std::lock_guard<arm_compute::Mutex> lock(m_Mutex); + BOOST_ASSERT_MSG(m_OccupiedPools.empty(), "All pools should be free in order to register a new one"); + + // Set pool + m_FreePools.push_front(std::move(pool)); + + // Update semaphore + m_Semaphore = std::make_unique<arm_compute::Semaphore>(m_FreePools.size()); +} + +size_t PoolManager::num_pools() const +{ + std::lock_guard<arm_compute::Mutex> lock(m_Mutex); + + return m_FreePools.size() + m_OccupiedPools.size(); +} + +void PoolManager::AllocatePools() +{ + std::lock_guard<arm_compute::Mutex> lock(m_Mutex); + + for (auto& pool : m_FreePools) + { + boost::polymorphic_downcast<IMemoryPool*>(pool.get())->AllocatePool(); + } + + for (auto& pool : m_OccupiedPools) + { + boost::polymorphic_downcast<IMemoryPool*>(pool.get())->AllocatePool(); + } +} + +void PoolManager::ReleasePools() +{ + std::lock_guard<arm_compute::Mutex> lock(m_Mutex); + + for (auto& pool : m_FreePools) + { + boost::polymorphic_downcast<IMemoryPool*>(pool.get())->ReleasePool(); + } + + for (auto& pool : m_OccupiedPools) + { + boost::polymorphic_downcast<IMemoryPool*>(pool.get())->ReleasePool(); + } +} + +} //namespace armnn
\ No newline at end of file diff --git a/src/armnn/memory/PoolManager.hpp b/src/armnn/memory/PoolManager.hpp new file mode 100644 index 0000000000..a8a51497aa --- /dev/null +++ b/src/armnn/memory/PoolManager.hpp @@ -0,0 +1,56 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#pragma once + +#include "IPoolManager.hpp" + +#include "arm_compute/runtime/IMemoryPool.h" +#include "arm_compute/core/Error.h" +#include "support/Mutex.h" +#include "support/Semaphore.h" + +#include <cstddef> +#include <list> +#include <memory> + +namespace armnn +{ + +class PoolManager : public IPoolManager +{ +public: + PoolManager(); + + PoolManager(const PoolManager &) = delete; + + PoolManager &operator=(const PoolManager &) = delete; + + PoolManager(PoolManager &&) = default; + + PoolManager &operator=(PoolManager &&) = default; + + arm_compute::IMemoryPool *lock_pool() override; + void unlock_pool(arm_compute::IMemoryPool *pool) override; + void register_pool(std::unique_ptr<arm_compute::IMemoryPool> pool) override; + size_t num_pools() const override; + + void AllocatePools() override; + void ReleasePools() override; + +private: + /// List of free pools + std::list<std::unique_ptr<arm_compute::IMemoryPool>> m_FreePools; + + /// List of occupied pools + std::list<std::unique_ptr<arm_compute::IMemoryPool>> m_OccupiedPools; + + /// Semaphore to control the queues + std::unique_ptr<arm_compute::Semaphore> m_Semaphore; + + /// Mutex to control access to the queues + mutable arm_compute::Mutex m_Mutex; +}; + +} // namespace armnn
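PoolManager adds thread-safe check-out of pools on top of the new allocate/release hooks: lock_pool() blocks on the semaphore until a free pool exists and moves it to the occupied list, unlock_pool() moves it back and signals waiters. A hedged sketch of how calling code might use it follows; the BlobMemoryPool and arm_compute::Allocator are stand-ins chosen only so there is a pool to register.

#include <memory>
#include <vector>

#include "memory/PoolManager.hpp"
#include "memory/BlobMemoryPool.hpp"
#include "arm_compute/runtime/Allocator.h"   // assumed concrete IAllocator for illustration

void ExamplePoolManager()
{
    arm_compute::Allocator allocator;
    armnn::PoolManager poolManager;

    // Pools must be registered while none are locked; the internal semaphore is sized to the
    // number of free pools.
    poolManager.register_pool(std::make_unique<armnn::BlobMemoryPool>(&allocator,
                                                                      std::vector<size_t>{1024}));

    arm_compute::IMemoryPool* pool = poolManager.lock_pool();   // blocks until a pool is free
    // ... acquire mappings from the pool and run the network ...
    poolManager.unlock_pool(pool);                              // returns it and signals waiters
}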
\ No newline at end of file diff --git a/src/armnn/optimizations/All.hpp b/src/armnn/optimizations/All.hpp index 70f78d44af..0603d44d31 100644 --- a/src/armnn/optimizations/All.hpp +++ b/src/armnn/optimizations/All.hpp @@ -4,8 +4,11 @@ // #pragma once +#include "ConvertConstants.hpp" #include "OptimizeInversePermutes.hpp" #include "PermuteAsReshape.hpp" #include "OptimizeConsecutiveReshapes.hpp" #include "SquashEqualSiblings.hpp" #include "MovePermuteUp.hpp" +#include "OptimizeInverseConversions.hpp" +#include "ConvertFp32NetworkToFp16.hpp" diff --git a/src/armnn/optimizations/ConvertConstants.hpp b/src/armnn/optimizations/ConvertConstants.hpp new file mode 100644 index 0000000000..d2dd650665 --- /dev/null +++ b/src/armnn/optimizations/ConvertConstants.hpp @@ -0,0 +1,98 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#pragma once + +#include "Optimization.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "Half.hpp" +#include "FloatingPointConverter.hpp" + +namespace armnn +{ +namespace optimizations +{ + +struct Float16ToFloat32 +{ + static void Func(std::unique_ptr<ScopedCpuTensorHandle>& handle) + { + const TensorInfo& info = handle->GetTensorInfo(); + + if (info.GetDataType() == DataType::Float16) + { + std::vector<float> newValues(info.GetNumElements()); + + armnnUtils::FloatingPointConverter::ConvertFloat16To32(handle->GetTensor<Half>(), + info.GetNumElements(), + newValues.data()); + + TensorInfo newInfo(info.GetShape(), DataType::Float32); + ConstTensor newInput(newInfo, newValues); + handle.reset(new ScopedCpuTensorHandle(newInput)); + } + } +}; + +struct Float32ToFloat16 +{ + static void Func(std::unique_ptr<ScopedCpuTensorHandle>& handle) + { + const TensorInfo& info = handle->GetTensorInfo(); + + if (info.GetDataType() == DataType::Float32) + { + std::vector<Half> newValues(info.GetNumElements()); + + armnnUtils::FloatingPointConverter::ConvertFloat32To16(handle->GetTensor<float>(), + info.GetNumElements(), + newValues.data()); + + TensorInfo newInfo(info.GetShape(), DataType::Float16); + ConstTensor newInput(newInfo, newValues); + handle.reset(new ScopedCpuTensorHandle(newInput)); + } + } +}; + +template<typename Converter, typename Predicate> +class ConvertConstants : public Optimization +{ +public: + ConvertConstants() = default; + ConvertConstants(const ConvertConstants&) = default; + virtual ~ConvertConstants() = default; + + void Run(Graph& graph, Layer& layer) const override + { + if (Predicate::Test(layer)) + { + layer.OperateOnConstantTensors(Converter::Func); + } + } +protected: +}; + +struct IsFloat32Layer +{ + static bool Test(const Layer& layer) + { + return layer.GetDataType() == DataType::Float32; + } +}; + +struct IsFloat16Layer +{ + static bool Test(const Layer& layer) + { + return layer.GetDataType() == DataType::Float16; + } +}; + +using ConvertConstantsHalfToFloat = ConvertConstants<Float16ToFloat32, IsFloat32Layer>; +using ConvertConstantsFloatToHalf = ConvertConstants<Float32ToFloat16, IsFloat16Layer>; + +} //namespace optimizations +} //namespace armnn diff --git a/src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp b/src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp new file mode 100644 index 0000000000..a4df05c18a --- /dev/null +++ b/src/armnn/optimizations/ConvertFp32NetworkToFp16.hpp @@ -0,0 +1,80 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#pragma once + +#include "Optimization.hpp" +#include "NetworkUtils.hpp" + +namespace armnn +{ +namespace optimizations +{ + +class ConvertFp32NetworkToFp16Impl +{ +public: + + void Run(Graph& graph, Layer& layer) const + { + if(layer.GetType() == LayerType::Input) + { + // if the outputs of this layer are DataType::Float32 + // add a ConvertFloat32ToFloat16 layer after each of the outputs + if (layer.GetDataType() == DataType::Float32) + { + InsertConvertFp32ToFp16LayersAfter(graph, layer); + } + } + else if (layer.GetType() == LayerType::Output) + { + // if the inputs of this layer are DataType::Float32 + // add a ConvertFloat16ToFloat32 layer before each of the inputs + if (layer.GetDataType() == DataType::Float32) + { + InsertConvertFp16ToFp32LayersBefore(graph, layer); + } + } + else if (layer.GetType() != LayerType::ConvertFp32ToFp16 && layer.GetType() != LayerType::ConvertFp16ToFp32) + { + // if the inputs/outputs of this layer are DataType::Float32 + // change the data type for all inputs and outputs to DataType::Float16 + for (auto&& input = layer.BeginInputSlots(); input != layer.EndInputSlots(); ++input) + { + // if it is connected to OutputSlot of the InputLayer do not change the DataType of connection + // InputSlots of the current layer will be updated when conversion layer is inserted after InputLayer + Layer& base = input->GetConnectedOutputSlot()->GetOwningLayer(); + if (base.GetType() != LayerType::Input) + { + TensorInfo convertInfo = input->GetConnection()->GetTensorInfo(); + if (convertInfo.GetDataType() == DataType::Float32) + { + convertInfo.SetDataType(DataType::Float16); + input->GetConnection()->SetTensorInfo(convertInfo); + } + } + } + + // change outputs to DataType::Float16 + for (auto&& output = layer.BeginOutputSlots(); output != layer.EndOutputSlots(); ++output) + { + TensorInfo convertInfo = output->GetTensorInfo(); + if (convertInfo.GetDataType() == DataType::Float32) + { + convertInfo.SetDataType(DataType::Float16); + output->SetTensorInfo(convertInfo); + } + } + } + } + +protected: + ConvertFp32NetworkToFp16Impl() = default; + ~ConvertFp32NetworkToFp16Impl() = default; +}; + +using Fp32NetworkToFp16Converter = OptimizeForType<Layer, ConvertFp32NetworkToFp16Impl>; + +} // namespace optimizations +} // namespace armnn diff --git a/src/armnn/optimizations/MovePermuteUp.hpp b/src/armnn/optimizations/MovePermuteUp.hpp index 8c59986762..a8e18f5add 100644 --- a/src/armnn/optimizations/MovePermuteUp.hpp +++ b/src/armnn/optimizations/MovePermuteUp.hpp @@ -31,24 +31,24 @@ public: auto permute = boost::polymorphic_downcast<PermuteLayer*>(&connection.GetOwningLayer()); const PermutationVector& perm = permute->GetPermutation(); - // Insert an equivalent permute before every input of the base layer. + // Inserts an equivalent permute before every input of the base layer. for (auto baseInput = base.BeginInputSlots(); baseInput != base.EndInputSlots(); ++baseInput) { - // Insert new permute layer. + // Inserts a new permute layer. const std::string name = std::string("moved_up-") + permute->GetName(); PermuteLayer& permLayer = *graph.InsertNewLayer<PermuteLayer>(*baseInput, perm, name.c_str()); - // Set output tensor info for the new layer. + // Sets output tensor info for the new layer. 
OutputSlot& parentOutput = *permLayer.GetInputSlot(0).GetConnectedOutputSlot(); const TensorInfo permOutInfo = armnnUtils::Permuted(parentOutput.GetTensorInfo(), perm); permLayer.GetOutputHandler().SetTensorInfo(permOutInfo); } - // Set permuted output tensor info + // Sets permuted output tensor info const TensorInfo& childOutInfo = permute->GetOutputHandler().GetTensorInfo(); base.GetOutputHandler().SetTensorInfo(childOutInfo); - // Bypass permute. It will be removed as it's left unconnected. + // Bypasses permute. It will be removed as it's left unconnected. permute->GetOutputSlot().MoveAllConnections(base.GetOutputSlot()); } } diff --git a/src/armnn/optimizations/Optimization.hpp b/src/armnn/optimizations/Optimization.hpp index f81071891b..ee4f91d842 100644 --- a/src/armnn/optimizations/Optimization.hpp +++ b/src/armnn/optimizations/Optimization.hpp @@ -13,9 +13,10 @@ namespace armnn class Optimization { public: + Optimization() = default; + virtual ~Optimization() = default; virtual void Run(Graph& graph, Layer& base) const = 0; protected: - ~Optimization() = default; }; // Wrappers @@ -44,7 +45,7 @@ protected: ~OptimizeForTypeImpl() = default; }; -/// Specialization that calls Wrapped::Run() for any layer type +/// Specialization that calls Wrapped::Run() for any layer type. template <typename Wrapped> class OptimizeForTypeImpl<Layer, Wrapped> : public armnn::Optimization, public Wrapped { @@ -90,7 +91,7 @@ public: } } - // Remove unconnected children + // Removes unconnected children. for (unsigned int i = 0; i < output->GetNumConnections();) { Layer* child = &output->GetConnection(i)->GetOwningLayer(); diff --git a/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp b/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp index 9a926a57a4..935186d32e 100644 --- a/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp +++ b/src/armnn/optimizations/OptimizeConsecutiveReshapes.hpp @@ -31,19 +31,19 @@ public: if (inInfo.GetShape() != outInfo.GetShape()) { - // Insert equivalent reshape before base layer + // Inserts equivalent reshape before base layer. const std::string name = std::string("merged-") + base.GetName() + std::string("-with-") + child.GetName(); const ReshapeDescriptor descriptor{outInfo.GetShape()}; auto& newReshape = *graph.InsertNewLayer<ReshapeLayer>(base.GetInputSlot(0), descriptor, name.c_str()); - // Set tensor info for new layer + // Sets tensor info for new layer. newReshape.GetOutputHandler().SetTensorInfo(outInfo); - // Reconnect base with original parent + // Reconnects base with original parent. newReshape.GetOutputSlot().MoveAllConnections(*parentOut); - // Parent is now the new layer + // Parent is now the new layer. parentOut = &newReshape.GetOutputSlot(); } - // Move connections in child output to parent layer. + // Moves connections in child output to parent layer. // Child layer will be removed as it's left unconnected. // Base layer will be removed if left unconnected. child.GetOutputSlot().MoveAllConnections(*parentOut); diff --git a/src/armnn/optimizations/OptimizeInverseConversions.hpp b/src/armnn/optimizations/OptimizeInverseConversions.hpp new file mode 100644 index 0000000000..5089d63f2f --- /dev/null +++ b/src/armnn/optimizations/OptimizeInverseConversions.hpp @@ -0,0 +1,44 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#pragma once + +#include "Optimization.hpp" + +namespace armnn +{ +namespace optimizations +{ + +class OptimizeInverseConversionsImpl +{ +public: + /// Run for every connection between two inverse data type conversion layers, i.e. + /// Fp16ToFp32 followed by Fp32ToFp16 or vice-versa. + void Run(Graph& graph, InputSlot& connection) const + { + Layer& base = connection.GetConnectedOutputSlot()->GetOwningLayer(); + Layer& child = connection.GetOwningLayer(); + + BOOST_ASSERT((base.GetType() == LayerType::ConvertFp16ToFp32 && + child.GetType() == LayerType::ConvertFp32ToFp16) || + (base.GetType() == LayerType::ConvertFp32ToFp16 && + child.GetType() == LayerType::ConvertFp16ToFp32)); + + // Bypass both conversion layers + child.GetOutputSlot().MoveAllConnections(*base.GetInputSlot(0).GetConnectedOutputSlot()); + } + +protected: + OptimizeInverseConversionsImpl() = default; + ~OptimizeInverseConversionsImpl() = default; +}; + +using OptimizeInverseConversionsFp16 = + OptimizeForConnection<ConvertFp16ToFp32Layer, ConvertFp32ToFp16Layer, OptimizeInverseConversionsImpl>; +using OptimizeInverseConversionsFp32 = + OptimizeForConnection<ConvertFp32ToFp16Layer, ConvertFp16ToFp32Layer, OptimizeInverseConversionsImpl>; + +} // namespace optimizations +} // namespace armnn diff --git a/src/armnn/optimizations/PermuteAsReshape.hpp b/src/armnn/optimizations/PermuteAsReshape.hpp index a8e4c2df5e..736cd5dc98 100644 --- a/src/armnn/optimizations/PermuteAsReshape.hpp +++ b/src/armnn/optimizations/PermuteAsReshape.hpp @@ -23,7 +23,7 @@ public: const std::string name = std::string("as_reshape-") + permute.GetName(); const ReshapeDescriptor descriptor{outInfo.GetShape()}; - // Insert so layers don't need to be re-sorted + // Inserts NewLayer so layers don't need to be re-sorted. auto reshape = graph.InsertNewLayer<ReshapeLayer>(permute.GetInputSlot(0), descriptor, name.c_str()); reshape->GetOutputHandler().SetTensorInfo(outInfo); diff --git a/src/armnn/optimizations/SquashEqualSiblings.hpp b/src/armnn/optimizations/SquashEqualSiblings.hpp index c5ce28e723..6e0fa78e4e 100644 --- a/src/armnn/optimizations/SquashEqualSiblings.hpp +++ b/src/armnn/optimizations/SquashEqualSiblings.hpp @@ -41,7 +41,7 @@ public: { std::swap(sibling, lowestPriorityChild); } - // Bypass sibling. It will be removed as it's left unconnected. + // Bypasses sibling. It will be removed as it's left unconnected. auto siblingOut = sibling->BeginOutputSlots(); for (auto lowestPriorityChildOut = lowestPriorityChild->BeginOutputSlots(); lowestPriorityChildOut != lowestPriorityChild->EndOutputSlots(); ++lowestPriorityChildOut) diff --git a/src/armnn/test/CreateWorkload.hpp b/src/armnn/test/CreateWorkload.hpp index c3f4b8a1bf..ee0c584b13 100644 --- a/src/armnn/test/CreateWorkload.hpp +++ b/src/armnn/test/CreateWorkload.hpp @@ -22,7 +22,7 @@ namespace using namespace std; -// Calls CreateWorkload for a layer, and checks the returned pointer is of the correct type +// Calls CreateWorkload for a layer, and checks the returned pointer is of the correct type. 
template<typename Workload> std::unique_ptr<Workload> MakeAndCheckWorkload(Layer& layer, Graph& graph, const IWorkloadFactory& factory) { @@ -30,18 +30,19 @@ std::unique_ptr<Workload> MakeAndCheckWorkload(Layer& layer, Graph& graph, const BOOST_TEST(workload.get() == boost::polymorphic_downcast<Workload*>(workload.get()), "Cannot convert to derived class"); std::string reasonIfUnsupported; + layer.SetComputeDevice(factory.GetCompute()); BOOST_TEST(factory.IsLayerSupported(layer, layer.GetDataType(), reasonIfUnsupported)); return std::unique_ptr<Workload>(static_cast<Workload*>(workload.release())); } -// connects two layers +// Connects two layers. void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) { from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); } -// helper function to create tensor handlers for workloads, assuming they all use the same factory +// Helper function to create tensor handlers for workloads, assuming they all use the same factory. void CreateTensorHandles(armnn::Graph& graph, armnn::IWorkloadFactory& factory) { for (auto&& layer : graph.TopologicalSort()) @@ -57,11 +58,11 @@ void CreateTensorHandles(armnn::Graph& graph, armnn::IWorkloadFactory& factory) // They return the created workloads so that backend-specific checks can be performed. ///////////////////////////////////////////////////////////////////////////////////////////// -template <typename ActivationWorkload> +template <typename ActivationWorkload, armnn::DataType DataType> std::unique_ptr<ActivationWorkload> CreateActivationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. ActivationDescriptor layerDesc; layerDesc.m_Function = ActivationFunction::Abs; layerDesc.m_A = 3.5f; @@ -69,19 +70,19 @@ std::unique_ptr<ActivationWorkload> CreateActivationWorkloadTest(armnn::IWorkloa ActivationLayer* const layer = graph.AddLayer<ActivationLayer>(layerDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({1, 1}, ActivationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({1, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<ActivationWorkload>(*layer, graph, factory); ActivationQueueDescriptor queueDescriptor = workload->GetData(); @@ -91,51 +92,51 @@ std::unique_ptr<ActivationWorkload> CreateActivationWorkloadTest(armnn::IWorkloa BOOST_TEST(queueDescriptor.m_Parameters.m_B == -10.0f); BOOST_TEST((queueDescriptor.m_Parameters.m_Function == ActivationFunction::Abs)); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename AdditionWorkload> +template <typename AdditionWorkload, armnn::DataType DataType> std::unique_ptr<AdditionWorkload> CreateAdditionWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer<AdditionLayer>("layer"); - // create extra layers + // Creates extra layers. 
Layer* const input1 = graph.AddLayer<InputLayer>(1, "input1"); Layer* const input2 = graph.AddLayer<InputLayer>(2, "input2"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3}, AdditionWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3}, DataType); Connect(input1, layer, tensorInfo, 0, 0); Connect(input2, layer, tensorInfo, 0, 1); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<AdditionWorkload>(*layer, graph, factory); AdditionQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 2); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename BatchNormalizationFloat32Workload> +template <typename BatchNormalizationFloat32Workload, armnn::DataType DataType> std::unique_ptr<BatchNormalizationFloat32Workload> CreateBatchNormalizationWorkloadTest( armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. BatchNormalizationDescriptor layerDesc; layerDesc.m_Eps = 0.05f; BatchNormalizationLayer* const layer = graph.AddLayer<BatchNormalizationLayer>(layerDesc, "layer"); - armnn::TensorInfo weightInfo({3}, armnn::DataType::Float32); + armnn::TensorInfo weightInfo({3}, DataType); layer->m_Mean = std::make_unique<ScopedCpuTensorHandle>(weightInfo); layer->m_Variance = std::make_unique<ScopedCpuTensorHandle>(weightInfo); layer->m_Beta = std::make_unique<ScopedCpuTensorHandle>(weightInfo); @@ -145,37 +146,37 @@ std::unique_ptr<BatchNormalizationFloat32Workload> CreateBatchNormalizationWorkl layer->m_Beta->Allocate(); layer->m_Gamma->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3, 1, 1}, armnn::DataType::Float32); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3, 1, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<BatchNormalizationFloat32Workload>(*layer, graph, factory); BatchNormalizationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Parameters.m_Eps == 0.05f); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Mean->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Variance->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Gamma->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); - BOOST_TEST((queueDescriptor.m_Beta->GetTensorInfo() == TensorInfo({3}, DataType::Float32))); + BOOST_TEST((queueDescriptor.m_Mean->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Variance->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Gamma->GetTensorInfo() == TensorInfo({3}, DataType))); + BOOST_TEST((queueDescriptor.m_Beta->GetTensorInfo() == TensorInfo({3}, DataType))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename Convolution2dWorkload> +template <typename Convolution2dWorkload, armnn::DataType DataType> std::unique_ptr<Convolution2dWorkload> CreateConvolution2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Convolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 3; layerDesc.m_PadRight = 3; @@ -187,24 +188,22 @@ std::unique_ptr<Convolution2dWorkload> CreateConvolution2dWorkloadTest(armnn::IW Convolution2dLayer* const layer = graph.AddLayer<Convolution2dLayer>(layerDesc, "layer"); - layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2, 3, 5, 3}, - Convolution2dWorkload::ms_DataType)); - layer->m_Bias = std::make_unique<ScopedCpuTensorHandle> - (TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType))); + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2, 3, 5, 3}, DataType)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({2}, GetBiasDataType(DataType))); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({2, 3, 8, 16}, Convolution2dWorkload::ms_DataType)); - Connect(layer, output, TensorInfo({2, 2, 2, 10}, Convolution2dWorkload::ms_DataType)); + // Connecst up. + Connect(input, layer, TensorInfo({2, 3, 8, 16}, DataType)); + Connect(layer, output, TensorInfo({2, 2, 2, 10}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<Convolution2dWorkload>(*layer, graph, factory); Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -218,20 +217,123 @@ std::unique_ptr<Convolution2dWorkload> CreateConvolution2dWorkloadTest(armnn::IW BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 5, 3}, - Convolution2dWorkload::ms_DataType))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 5, 3}, DataType))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == - TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType)))); + TensorInfo({2}, GetBiasDataType(DataType)))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename Convolution2dWorkload> +template <typename LstmWorkload> +std::unique_ptr<LstmWorkload> CreateLstmWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // This parameter setting is for withCifgWithPeepholeNoProjection + LstmDescriptor layerDesc; + layerDesc.m_ActivationFunc = 4; + layerDesc.m_ClippingThresCell = 0.0f; + layerDesc.m_ClippingThresProj = 0.0f; + layerDesc.m_CifgEnabled = true; + layerDesc.m_PeepholeEnabled = true; + layerDesc.m_ProjectionEnabled = false; + + LstmLayer* const layer = graph.AddLayer<LstmLayer>(layerDesc, "layer"); + unsigned int batchSize = 2; + unsigned int inputSize = 2; + unsigned int numUnits = 4; + unsigned int outputSize = 4; + + layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_ForgetGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_CellBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_OutputGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + + layer->m_BasicParameters.m_InputToForgetWeights->Allocate(); + layer->m_BasicParameters.m_InputToCellWeights->Allocate(); + layer->m_BasicParameters.m_InputToOutputWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToForgetWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToCellWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToOutputWeights->Allocate(); + layer->m_BasicParameters.m_ForgetGateBias->Allocate(); + layer->m_BasicParameters.m_CellBias->Allocate(); + layer->m_BasicParameters.m_OutputGateBias->Allocate(); + + + if (layerDesc.m_PeepholeEnabled) + { + 
layer->m_PeepholeParameters.m_CellToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToForgetWeights->Allocate(); + layer->m_PeepholeParameters.m_CellToOutputWeights->Allocate(); + } + + // create input and output layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const outputStateIn = graph.AddLayer<InputLayer>(1, "outputStateIn"); + Layer* const cellStateIn = graph.AddLayer<InputLayer>(2, "cellStateIn"); + Layer* const scratchBuffer = graph.AddLayer<OutputLayer>(0, "scratchBuffer"); + Layer* const outputStateOut = graph.AddLayer<OutputLayer>(1, "outputStateOut"); + Layer* const cellStateOut = graph.AddLayer<OutputLayer>(2, "cellStateOut"); + Layer* const output = graph.AddLayer<OutputLayer>(3, "output"); + + // connect up + armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32); + armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32); + if (layerDesc.m_CifgEnabled) + { + lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 }); + } + + Connect(input, layer, lstmTensorInfo1, 0, 0); + Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1); + Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2); + Connect(layer, scratchBuffer, lstmTensorInfoScratchBuff, 0, 0); + Connect(layer, outputStateOut, lstmTensorInfo3, 1, 0); + Connect(layer, cellStateOut, lstmTensorInfo2, 2, 0); + Connect(layer, output, lstmTensorInfo3, 3, 0); + + CreateTensorHandles(graph, factory); + + // make the workload and check it + auto workload = MakeAndCheckWorkload<LstmWorkload>(*layer, graph, factory); + LstmQueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Parameters.m_ActivationFunc == 4); + BOOST_TEST(queueDescriptor.m_Parameters.m_ClippingThresCell == 0.0f); + BOOST_TEST(queueDescriptor.m_Parameters.m_ClippingThresProj == 0.0f); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 3); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 4); + + BOOST_TEST((queueDescriptor.m_InputToForgetWeights->GetTensorInfo() == TensorInfo({ numUnits, inputSize }, + DataType::Float32))); + BOOST_TEST((queueDescriptor.m_OutputGateBias->GetTensorInfo() == TensorInfo({ numUnits }, + DataType::Float32))); + BOOST_TEST((queueDescriptor.m_CellBias->GetTensorInfo() == TensorInfo({ numUnits }, DataType::Float32))); + return workload; +} + +template <typename Convolution2dWorkload, armnn::DataType DataType> std::unique_ptr<Convolution2dWorkload> CreateDirectConvolution2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Convolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 1; layerDesc.m_PadRight = 1; @@ -243,26 +345,25 @@ std::unique_ptr<Convolution2dWorkload> CreateDirectConvolution2dWorkloadTest(arm Convolution2dLayer* const layer = graph.AddLayer<Convolution2dLayer>(layerDesc, "layer"); - float inputsQScale = Convolution2dWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = Convolution2dWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 
2.0f : 0.0; + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; - layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({ 2, 3, 3, 3 }, - Convolution2dWorkload::ms_DataType, inputsQScale)); + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({ 2, 3, 3, 3 }, DataType, inputsQScale)); layer->m_Bias = std::make_unique<ScopedCpuTensorHandle> - (TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType), inputsQScale)); + (TensorInfo({2}, GetBiasDataType(DataType), inputsQScale)); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({2, 3, 6, 6}, Convolution2dWorkload::ms_DataType, inputsQScale)); - Connect(layer, output, TensorInfo({2, 2, 6, 6}, Convolution2dWorkload::ms_DataType, outputQScale)); + // Connects up. + Connect(input, layer, TensorInfo({2, 3, 6, 6}, DataType, inputsQScale)); + Connect(layer, output, TensorInfo({2, 2, 6, 6}, DataType, outputQScale)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<Convolution2dWorkload>(*layer, graph, factory); Convolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -277,11 +378,11 @@ std::unique_ptr<Convolution2dWorkload> CreateDirectConvolution2dWorkloadTest(arm BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({2, 3, 3, 3}, - Convolution2dWorkload::ms_DataType, inputsQScale))); + DataType, inputsQScale))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() - == TensorInfo({2}, GetBiasDataType(Convolution2dWorkload::ms_DataType), inputsQScale))); + == TensorInfo({2}, GetBiasDataType(DataType), inputsQScale))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } @@ -289,7 +390,7 @@ template <typename DepthwiseConvolution2dFloat32Workload> std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolution2dWorkloadTest( armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. DepthwiseConvolution2dDescriptor layerDesc; layerDesc.m_PadLeft = 3; layerDesc.m_PadRight = 3; @@ -306,16 +407,16 @@ std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolutio layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up + // Connects up. Connect(input, layer, TensorInfo({2, 3, 8, 16}, armnn::DataType::Float32)); Connect(layer, output, TensorInfo({2, 9, 2, 10}, armnn::DataType::Float32)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<DepthwiseConvolution2dFloat32Workload>(*layer, graph, factory); DepthwiseConvolution2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -332,41 +433,39 @@ std::unique_ptr<DepthwiseConvolution2dFloat32Workload> CreateDepthwiseConvolutio BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({3, 3, 5, 3}, DataType::Float32))); BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({9}, DataType::Float32))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename FullyConnectedWorkload> +template <typename FullyConnectedWorkload, armnn::DataType DataType> std::unique_ptr<FullyConnectedWorkload> CreateFullyConnectedWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. FullyConnectedDescriptor layerDesc; layerDesc.m_BiasEnabled = true; layerDesc.m_TransposeWeightMatrix = true; FullyConnectedLayer* const layer = graph.AddLayer<FullyConnectedLayer>(layerDesc, "layer"); - float inputsQScale = FullyConnectedWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 1.0f : 0.0; - float outputQScale = FullyConnectedWorkload::ms_DataType == DataType::QuantisedAsymm8 ? 2.0f : 0.0; + float inputsQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 1.0f : 0.0; + float outputQScale = DataType == armnn::DataType::QuantisedAsymm8 ? 2.0f : 0.0; - layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7, 20}, - FullyConnectedWorkload::ms_DataType, inputsQScale, 0)); - layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7}, - GetBiasDataType(FullyConnectedWorkload::ms_DataType), inputsQScale)); + layer->m_Weight = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7, 20}, DataType, inputsQScale, 0)); + layer->m_Bias = std::make_unique<ScopedCpuTensorHandle>(TensorInfo({7}, GetBiasDataType(DataType), inputsQScale)); layer->m_Weight->Allocate(); layer->m_Bias->Allocate(); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 1, 4, 5}, FullyConnectedWorkload::ms_DataType, inputsQScale)); - Connect(layer, output, TensorInfo({3, 7}, FullyConnectedWorkload::ms_DataType, outputQScale)); + // Connects up. + Connect(input, layer, TensorInfo({3, 1, 4, 5}, DataType, inputsQScale)); + Connect(layer, output, TensorInfo({3, 7}, DataType, outputQScale)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<FullyConnectedWorkload>(*layer, graph, factory); FullyConnectedQueueDescriptor queueDescriptor = workload->GetData(); @@ -375,50 +474,48 @@ std::unique_ptr<FullyConnectedWorkload> CreateFullyConnectedWorkloadTest(armnn:: BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == - TensorInfo({7, 20}, FullyConnectedWorkload::ms_DataType, inputsQScale))); - BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == - TensorInfo({7}, GetBiasDataType(FullyConnectedWorkload::ms_DataType), inputsQScale))); + BOOST_TEST((queueDescriptor.m_Weight->GetTensorInfo() == TensorInfo({7, 20}, DataType, inputsQScale))); + BOOST_TEST((queueDescriptor.m_Bias->GetTensorInfo() == TensorInfo({7}, GetBiasDataType(DataType), inputsQScale))); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename MultiplicationWorkload> +template <typename MultiplicationWorkload, armnn::DataType DataType> std::unique_ptr<MultiplicationWorkload> CreateMultiplicationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer<MultiplicationLayer>("layer"); - // create extra layers + // Creates extra layers. Layer* const input1 = graph.AddLayer<InputLayer>(1, "input1"); Layer* const input2 = graph.AddLayer<InputLayer>(2, "input2"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({2, 3}, MultiplicationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({2, 3}, DataType); Connect(input1, layer, tensorInfo, 0, 0); Connect(input2, layer, tensorInfo, 0, 1); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<MultiplicationWorkload>(*layer, graph, factory); MultiplicationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 2); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename NormalizationFloat32Workload> +template <typename NormalizationFloat32Workload, armnn::DataType DataType> std::unique_ptr<NormalizationFloat32Workload> CreateNormalizationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. NormalizationDescriptor layerDesc; layerDesc.m_NormChannelType = NormalizationAlgorithmChannel::Across; layerDesc.m_NormMethodType = NormalizationAlgorithmMethod::LocalBrightness; @@ -429,16 +526,16 @@ std::unique_ptr<NormalizationFloat32Workload> CreateNormalizationWorkloadTest(ar NormalizationLayer* layer = graph.AddLayer<NormalizationLayer>(layerDesc, "layer"); - // create extra layers + // Creatse extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 5, 5, 1}, armnn::DataType::Float32)); - Connect(layer, output, TensorInfo({3, 5, 5, 1}, armnn::DataType::Float32)); + // Connects up. 
+ Connect(input, layer, TensorInfo({3, 5, 5, 1}, DataType)); + Connect(layer, output, TensorInfo({3, 5, 5, 1}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<NormalizationFloat32Workload>(*layer, graph, factory); NormalizationQueueDescriptor queueDescriptor = workload->GetData(); @@ -452,15 +549,15 @@ std::unique_ptr<NormalizationFloat32Workload> CreateNormalizationWorkloadTest(ar BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename Pooling2dWorkload> +template <typename Pooling2dWorkload, armnn::DataType DataType> std::unique_ptr<Pooling2dWorkload> CreatePooling2dWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Pooling2dDescriptor layerDesc; layerDesc.m_PoolType = PoolingAlgorithm::Average; layerDesc.m_PoolWidth = 3; @@ -475,16 +572,16 @@ std::unique_ptr<Pooling2dWorkload> CreatePooling2dWorkloadTest(armnn::IWorkloadF Pooling2dLayer* const layer = graph.AddLayer<Pooling2dLayer>(layerDesc, "layer"); - // create extra layers + // Create extra layers Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - Connect(input, layer, TensorInfo({3, 2, 5, 5}, Pooling2dWorkload::ms_DataType)); - Connect(layer, output, TensorInfo({3, 2, 2, 4}, Pooling2dWorkload::ms_DataType)); + // Connect up + Connect(input, layer, TensorInfo({3, 2, 5, 5}, DataType)); + Connect(layer, output, TensorInfo({3, 2, 2, 4}, DataType)); CreateTensorHandles(graph, factory); - // make the workload and check it + // Make the workload and checks it auto workload = MakeAndCheckWorkload<Pooling2dWorkload>(*layer, graph, factory); Pooling2dQueueDescriptor queueDescriptor = workload->GetData(); @@ -502,70 +599,70 @@ std::unique_ptr<Pooling2dWorkload> CreatePooling2dWorkloadTest(armnn::IWorkloadF BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Return so we can do extra, backend-specific tests return workload; } -template <typename SoftmaxWorkload> +template <typename SoftmaxWorkload, armnn::DataType DataType> std::unique_ptr<SoftmaxWorkload> CreateSoftmaxWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Create the layer we're testing. SoftmaxDescriptor softmaxDescriptor; Layer* const layer = graph.AddLayer<SoftmaxLayer>(softmaxDescriptor, "layer"); - // create extra layers + // Create extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo tensorInfo({4, 1}, SoftmaxWorkload::ms_DataType); + // Connect up + armnn::TensorInfo tensorInfo({4, 1}, DataType); Connect(input, layer, tensorInfo); Connect(layer, output, tensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Make the workload and checks it. 
auto workload = MakeAndCheckWorkload<SoftmaxWorkload>(*layer, graph, factory); SoftmaxQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Return so we can do extra, backend-specific tests. return workload; } -template<typename SplitterWorkload> +template<typename SplitterWorkload, armnn::DataType DataType> std::unique_ptr<SplitterWorkload> CreateSplitterWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Create the layer we're testing. // NOTE: need three dimensions channels, height/y, width/x because the Compute // library restricts subtensors to have the same x and y dimensions as // their parent tensors, and therefore the origin on the x and y dimension // has to be zero for any view. So we need a third dimension to split... - // NOTE: arguments are: number of views, number of dimensions + // NOTE: arguments are: number of views, number of dimensions. ViewsDescriptor layerDesc(3, 3); - // NOTE: arguments are: view, dimension, value + // NOTE: arguments are: view, dimension, value. layerDesc.SetViewOriginCoord(0, 0, 0); layerDesc.SetViewOriginCoord(1, 0, 1); layerDesc.SetViewOriginCoord(2, 0, 3); Layer* const layer = graph.AddLayer<SplitterLayer>(layerDesc, "layer"); - // add extra layers + // Adds extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output0 = graph.AddLayer<OutputLayer>(0, "output0"); Layer* const output1 = graph.AddLayer<OutputLayer>(1, "output1"); Layer* const output2 = graph.AddLayer<OutputLayer>(2, "output2"); - // connect up - armnn::TensorInfo tensorInfo({5, 7, 7}, SplitterWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo tensorInfo({5, 7, 7}, DataType); Connect(input, layer, tensorInfo); - armnn::TensorInfo output0Info({1, 7, 7}, SplitterWorkload::ms_DataType); - armnn::TensorInfo output1Info({2, 7, 7}, SplitterWorkload::ms_DataType); - armnn::TensorInfo output2Info({2, 7, 7}, SplitterWorkload::ms_DataType); + armnn::TensorInfo output0Info({1, 7, 7}, DataType); + armnn::TensorInfo output1Info({2, 7, 7}, DataType); + armnn::TensorInfo output2Info({2, 7, 7}, DataType); Connect(layer, output0, output0Info, 0, 0); Connect(layer, output1, output1Info, 1, 0); @@ -573,7 +670,7 @@ std::unique_ptr<SplitterWorkload> CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<SplitterWorkload>(*layer, graph, factory); SplitterQueueDescriptor queueDescriptor = workload->GetData(); @@ -591,24 +688,21 @@ std::unique_ptr<SplitterWorkload> BOOST_TEST(queueDescriptor.m_ViewOrigins[1].m_Origin[2] == 0); BOOST_TEST(queueDescriptor.m_ViewOrigins[2].m_Origin[2] == 0); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -/// This function constructs a graph with both a splitter and a merger, and returns a pair of the workloads -template<typename SplitterWorkload, typename MergerWorkload> +/// This function constructs a graph with both a splitter and a merger, and returns a pair of the workloads. 
+template<typename SplitterWorkload, typename MergerWorkload, armnn::DataType DataType> std::pair<std::unique_ptr<SplitterWorkload>, std::unique_ptr<MergerWorkload>> CreateSplitterMergerWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - static_assert(SplitterWorkload::ms_DataType == MergerWorkload::ms_DataType, - "Splitter and merger workloads must have the same data type"); + armnn::TensorInfo inputTensorInfo({ 1, 2, 100, 10 }, DataType); - armnn::TensorInfo inputTensorInfo({ 1, 2, 100, 10 }, SplitterWorkload::ms_DataType); + armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 10 }, DataType); + armnn::TensorInfo splitTensorInfo2({ 1, 1, 100, 10 }, DataType); - armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 10 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo2({ 1, 1, 100, 10 }, SplitterWorkload::ms_DataType); - - //construct the graph + //Constructs the graph. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); armnn::ViewsDescriptor splitterViews(2); @@ -641,12 +735,12 @@ std::pair<std::unique_ptr<SplitterWorkload>, std::unique_ptr<MergerWorkload>> Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // add connections + // Adds connections. Connect(input, splitter, inputTensorInfo, 0, 0); BOOST_TEST_CHECKPOINT("connect input to splitter"); - Connect(splitter, merger, splitTensorInfo1, 0, 1); // The splitter & merger are connected up + Connect(splitter, merger, splitTensorInfo1, 0, 1); // The splitter & merger are connected up. BOOST_TEST_CHECKPOINT("connect splitter[0] to merger[1]"); - Connect(splitter, merger, splitTensorInfo2, 1, 0); // so that the outputs are flipped round + Connect(splitter, merger, splitTensorInfo2, 1, 0); // So that the outputs are flipped round. BOOST_TEST_CHECKPOINT("connect splitter[1] to merger[0]"); Connect(merger, output, inputTensorInfo, 0, 0); BOOST_TEST_CHECKPOINT("connect merger to output"); @@ -665,7 +759,7 @@ std::pair<std::unique_ptr<SplitterWorkload>, std::unique_ptr<MergerWorkload>> /// This function constructs a graph with a splitter with two outputs. Each of the outputs is then /// connected to two different activation layers -template<typename SplitterWorkload, typename ActivationWorkload> +template<typename SplitterWorkload, typename ActivationWorkload, armnn::DataType DataType> void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph, std::unique_ptr<SplitterWorkload>& wlSplitter, std::unique_ptr<ActivationWorkload>& wlActiv0_0, @@ -673,14 +767,11 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& std::unique_ptr<ActivationWorkload>& wlActiv1_0, std::unique_ptr<ActivationWorkload>& wlActiv1_1) { - static_assert(SplitterWorkload::ms_DataType == ActivationWorkload::ms_DataType, - "Splitter and activation workloads must have the same data type"); - - armnn::TensorInfo inputTensorInfo ({ 1, 3, 100, 50 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 50 }, SplitterWorkload::ms_DataType); - armnn::TensorInfo splitTensorInfo2({ 1, 2, 100, 50 }, SplitterWorkload::ms_DataType); + armnn::TensorInfo inputTensorInfo ({ 1, 3, 100, 50 }, DataType); + armnn::TensorInfo splitTensorInfo1({ 1, 1, 100, 50 }, DataType); + armnn::TensorInfo splitTensorInfo2({ 1, 2, 100, 50 }, DataType); - //construct the graph + //Constructs the graph. 
Layer* const input = graph.AddLayer<InputLayer>(0, "input"); armnn::ViewsDescriptor splitterViews(2); @@ -709,7 +800,7 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& Layer* const output3 = graph.AddLayer<OutputLayer>(3, "output3"); Layer* const output4 = graph.AddLayer<OutputLayer>(4, "output4"); - // add connections + // Adds connections. Connect(input, splitter, inputTensorInfo, 0, 0); Connect(splitter, activ0_0, splitTensorInfo1, 0, 0); Connect(splitter, activ0_1, splitTensorInfo1, 0, 0); @@ -737,97 +828,155 @@ void CreateSplitterMultipleInputsOneOutputWorkloadTest(armnn::IWorkloadFactory& wlActiv1_1 = std::move(workloadActiv1_1); } -template <typename ResizeBilinearWorkload> +template <typename ResizeBilinearWorkload, armnn::DataType DataType> std::unique_ptr<ResizeBilinearWorkload> CreateResizeBilinearWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. TensorShape outputShape({ 2, 3, 2, 2 }); ResizeBilinearDescriptor resizeDesc; resizeDesc.m_TargetWidth = outputShape[3]; resizeDesc.m_TargetHeight = outputShape[2]; Layer* const layer = graph.AddLayer<ResizeBilinearLayer>(resizeDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 2, 3, 4, 4 }, ResizeBilinearWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo(outputShape, ResizeBilinearWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 2, 3, 4, 4 }, DataType); + armnn::TensorInfo outputTensorInfo(outputShape, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<ResizeBilinearWorkload>(*layer, graph, factory); ResizeBilinearQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename L2NormalizationWorkload> +template <typename L2NormalizationWorkload, armnn::DataType DataType> std::unique_ptr<L2NormalizationWorkload> CreateL2NormalizationWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. Layer* const layer = graph.AddLayer<L2NormalizationLayer>("l2norm"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 5, 20, 50, 67 }, L2NormalizationWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo({ 5, 20, 50, 67 }, L2NormalizationWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 5, 20, 50, 67 }, DataType); + armnn::TensorInfo outputTensorInfo({ 5, 20, 50, 67 }, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. 
auto workload = MakeAndCheckWorkload<L2NormalizationWorkload>(*layer, graph, factory); L2NormalizationQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. return workload; } -template <typename ReshapeWorkload> +template <typename ReshapeWorkload, armnn::DataType DataType> std::unique_ptr<ReshapeWorkload> CreateReshapeWorkloadTest(armnn::IWorkloadFactory& factory, armnn::Graph& graph) { - // create the layer we're testing + // Creates the layer we're testing. TensorShape outputShape({ 1, 4 }); ReshapeDescriptor reshapeDesc; reshapeDesc.m_TargetShape = outputShape; Layer* const layer = graph.AddLayer<ReshapeLayer>(reshapeDesc, "layer"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up - armnn::TensorInfo inputTensorInfo({ 4, 1 }, ReshapeWorkload::ms_DataType); - armnn::TensorInfo outputTensorInfo(outputShape, ReshapeWorkload::ms_DataType); + // Connects up. + armnn::TensorInfo inputTensorInfo({ 4, 1 }, DataType); + armnn::TensorInfo outputTensorInfo(outputShape, DataType); Connect(input, layer, inputTensorInfo); Connect(layer, output, outputTensorInfo); CreateTensorHandles(graph, factory); - // make the workload and check it + // Makes the workload and checks it. auto workload = MakeAndCheckWorkload<ReshapeWorkload>(*layer, graph, factory); ReshapeQueueDescriptor queueDescriptor = workload->GetData(); BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); - // return so we can do extra, backend-specific tests + // Returns so we can do extra, backend-specific tests. + return workload; +} + +template <typename ConvertFp16ToFp32Float32Workload> +std::unique_ptr<ConvertFp16ToFp32Float32Workload> CreateConvertFp16ToFp32WorkloadTest( + armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // Creates the layer we're testing. + ConvertFp16ToFp32Layer* const layer = graph.AddLayer<ConvertFp16ToFp32Layer>("Fp16ToFp32Converter"); + + // Creates extra layers. + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // Connects up. + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + Connect(input, layer, inputTensorInfo); + Connect(layer, output, outputTensorInfo); + CreateTensorHandles(graph, factory); + + // Makes the workload and checks it. + auto workload = MakeAndCheckWorkload<ConvertFp16ToFp32Float32Workload>(*layer, graph, factory); + + ConvertFp16ToFp32QueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); + + // Returns so we can do extra, backend-specific tests. + return workload; +} + +template <typename ConvertFp32ToFp16Float16Workload> +std::unique_ptr<ConvertFp32ToFp16Float16Workload> CreateConvertFp32ToFp16WorkloadTest( + armnn::IWorkloadFactory& factory, armnn::Graph& graph) +{ + // Creates the layer we're testing. + ConvertFp32ToFp16Layer* const layer = graph.AddLayer<ConvertFp32ToFp16Layer>("Fp32ToFp16Converter"); + + // Creates extra layers. 
+ Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); + + // Connects up. + armnn::TensorInfo inputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32); + armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16); + Connect(input, layer, inputTensorInfo); + Connect(layer, output, outputTensorInfo); + CreateTensorHandles(graph, factory); + + // Makes the workload and checks it. + auto workload = MakeAndCheckWorkload<ConvertFp32ToFp16Float16Workload>(*layer, graph, factory); + + ConvertFp32ToFp16QueueDescriptor queueDescriptor = workload->GetData(); + BOOST_TEST(queueDescriptor.m_Inputs.size() == 1); + BOOST_TEST(queueDescriptor.m_Outputs.size() == 1); + + // Returns so we can do extra, backend-specific tests. return workload; } diff --git a/src/armnn/test/CreateWorkloadClNeon.hpp b/src/armnn/test/CreateWorkloadClNeon.hpp index a41a70755f..d92111ac41 100644 --- a/src/armnn/test/CreateWorkloadClNeon.hpp +++ b/src/armnn/test/CreateWorkloadClNeon.hpp @@ -56,22 +56,21 @@ boost::test_tools::predicate_result CompareTensorHandleShape(IComputeTensorHandl return true; } -template<template <DataType> class CopyFromCpuWorkload, template <DataType> class CopyToCpuWorkload, - typename IComputeTensorHandle> +template<typename IComputeTensorHandle> void CreateMemCopyWorkloads(IWorkloadFactory& factory) { Graph graph; RefWorkloadFactory refFactory; - // create the layers we're testing + // Creates the layers we're testing. Layer* const layer1 = graph.AddLayer<MemCopyLayer>("layer1"); Layer* const layer2 = graph.AddLayer<MemCopyLayer>("layer2"); - // create extra layers + // Creates extra layers. Layer* const input = graph.AddLayer<InputLayer>(0, "input"); Layer* const output = graph.AddLayer<OutputLayer>(0, "output"); - // connect up + // Connects up. TensorInfo tensorInfo({2, 3}, DataType::Float32); Connect(input, layer1, tensorInfo); Connect(layer1, layer2, tensorInfo); @@ -83,8 +82,8 @@ void CreateMemCopyWorkloads(IWorkloadFactory& factory) output->CreateTensorHandles(graph, refFactory); // make the workloads and check them - auto workload1 = MakeAndCheckWorkload<CopyFromCpuWorkload<DataType::Float32>>(*layer1, graph, factory); - auto workload2 = MakeAndCheckWorkload<CopyToCpuWorkload<DataType::Float32>>(*layer2, graph, refFactory); + auto workload1 = MakeAndCheckWorkload<CopyMemGenericWorkload>(*layer1, graph, factory); + auto workload2 = MakeAndCheckWorkload<CopyMemGenericWorkload>(*layer2, graph, refFactory); MemCopyQueueDescriptor queueDescriptor1 = workload1->GetData(); BOOST_TEST(queueDescriptor1.m_Inputs.size() == 1); @@ -104,4 +103,4 @@ void CreateMemCopyWorkloads(IWorkloadFactory& factory) BOOST_TEST((outputHandle2->GetTensorInfo() == TensorInfo({2, 3}, DataType::Float32))); } -}
\ No newline at end of file +} //namespace
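For reference, a minimal sketch of how a backend-specific test might now instantiate one of the reworked helpers, given that the data type has become a template argument instead of being read from Workload::ms_DataType. The RefActivationFloat32Workload type named here is an illustrative assumption and not part of this change; RefWorkloadFactory is the factory already used in the MemCopy test above.

// Sketch only: RefActivationFloat32Workload is a placeholder workload type.
BOOST_AUTO_TEST_CASE(CreateActivationFloat32WorkloadRefSketch)
{
    armnn::Graph graph;
    armnn::RefWorkloadFactory factory;

    // The data type is now supplied as the second template argument.
    auto workload = CreateActivationWorkloadTest<RefActivationFloat32Workload,
                                                 armnn::DataType::Float32>(factory, graph);

    // Further backend-specific checks can be performed on the returned workload.
    armnn::ActivationQueueDescriptor queueDescriptor = workload->GetData();
    BOOST_TEST(queueDescriptor.m_Inputs.size() == 1);
    BOOST_TEST(queueDescriptor.m_Outputs.size() == 1);
}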
\ No newline at end of file diff --git a/src/armnn/test/CsvReaderTest.cpp b/src/armnn/test/CsvReaderTest.cpp new file mode 100644 index 0000000000..8df61e1fdd --- /dev/null +++ b/src/armnn/test/CsvReaderTest.cpp @@ -0,0 +1,124 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include "CsvReader.hpp" + +#include <boost/algorithm/string.hpp> +#include <boost/test/unit_test.hpp> + +#include <iostream> +#include <string> +#include <boost/filesystem.hpp> + +using namespace armnnUtils; + +struct TestHelper { + + TestHelper() + { + BOOST_TEST_MESSAGE("setup fixture"); + } + + ~TestHelper() + { + BOOST_TEST_MESSAGE("teardown fixture"); + TearDown(); + } + + std::string CreateTempCsvFile() + { + std::string fileDir = boost::filesystem::temp_directory_path().c_str(); + boost::filesystem::path p{fileDir + "/sampleFile.csv"}; + try + { + boost::filesystem::ofstream ofs{p}; + ofs << "airplane, bicycle , bird , \"m,o,n,k,e,y\"\n"; + ofs << "banana, shoe, \"ice\""; + ofs.close(); + } catch (std::exception &e) + { + std::cerr << "Unable to write to file at location [" << p.c_str() << "] : " << e.what() << std::endl; + BOOST_TEST(false); + } + return fileDir + "/sampleFile.csv"; + } + + int CheckStringsMatch(CsvRow &row, unsigned int index, std::string expectedValue) + { + return row.values.at(index).compare(expectedValue); + } + + void TearDown() + { + RemoveCsvFile(); + } + + void RemoveCsvFile() + { + std::string fileDir = boost::filesystem::temp_directory_path().c_str(); + std::string filePath = fileDir + "/sampleFile.csv"; + try + { + boost::filesystem::remove(filePath); + } + catch (std::exception &e) + { + std::cerr << "Unable to delete file [" << filePath << "] : " << e.what() << std::endl; + BOOST_TEST(false); + } + } +}; + +BOOST_AUTO_TEST_SUITE(CsvReaderTest) + +BOOST_FIXTURE_TEST_CASE(TestParseVector, TestHelper) +{ + CsvReader reader; + std::vector<std::string> csvStrings; + csvStrings.reserve(2); + csvStrings.push_back("airplane, automobile , bird , \"c,a,t\""); + csvStrings.push_back("banana, shoe, \"ice\""); + + std::vector<CsvRow> row = reader.ParseVector(csvStrings); + CsvRow row1 = row[0]; + CsvRow row2 = row[1]; + + BOOST_CHECK(row.size() == 2); + + BOOST_CHECK(row1.values.size() == 4); + BOOST_CHECK(CheckStringsMatch(row1, 0, "airplane") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 1, "automobile") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 2, "bird") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 3, "c,a,t") == 0); + + BOOST_CHECK(row2.values.size() == 3); + BOOST_CHECK(CheckStringsMatch(row2, 0, "banana") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 1, "shoe") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 2, "ice") == 0); +} + +BOOST_FIXTURE_TEST_CASE(TestLoadingFileFromDisk, TestHelper) +{ + CsvReader reader; + std::string theFilePath = TestHelper::CreateTempCsvFile(); + + std::vector<CsvRow> row = reader.ParseFile(theFilePath); + CsvRow row1 = row[0]; + CsvRow row2 = row[1]; + + BOOST_CHECK(row.size() == 2); + + BOOST_CHECK(row1.values.size() == 4); + BOOST_CHECK(CheckStringsMatch(row1, 0, "airplane") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 1, "bicycle") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 2, "bird") == 0); + BOOST_CHECK(CheckStringsMatch(row1, 3, "m,o,n,k,e,y") == 0); + + BOOST_CHECK(row2.values.size() == 3); + BOOST_CHECK(CheckStringsMatch(row2, 0, "banana") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 1, "shoe") == 0); + BOOST_CHECK(CheckStringsMatch(row2, 2, "ice") == 0); +} + 
+BOOST_AUTO_TEST_SUITE_END()
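As a usage sketch grounded only in the API exercised by the tests above (ParseFile returning a vector of CsvRow objects whose values member holds the parsed fields), a caller might consume the parsed rows as follows; the printing is purely illustrative.

// Sketch: iterate the rows produced by CsvReader::ParseFile.
#include "CsvReader.hpp"
#include <iostream>
#include <string>
#include <vector>

void PrintCsvFile(const std::string& filePath)
{
    armnnUtils::CsvReader reader;
    std::vector<armnnUtils::CsvRow> rows = reader.ParseFile(filePath);

    for (const armnnUtils::CsvRow& row : rows)
    {
        // values is assumed to be an iterable container of strings,
        // as implied by row.values.at(index) in the tests above.
        for (const std::string& value : row.values)
        {
            std::cout << value << " | ";
        }
        std::cout << std::endl;
    }
}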
\ No newline at end of file diff --git a/src/armnn/test/EndToEndTest.cpp b/src/armnn/test/EndToEndTest.cpp index 5ed84d22d0..4a8a0dfd81 100644 --- a/src/armnn/test/EndToEndTest.cpp +++ b/src/armnn/test/EndToEndTest.cpp @@ -11,6 +11,8 @@ #include "backends/test/QuantizeHelper.hpp" #include <boost/core/ignore_unused.hpp> +#include <set> + BOOST_AUTO_TEST_SUITE(EndToEnd) namespace @@ -47,9 +49,10 @@ BOOST_AUTO_TEST_CASE(Unsigned8) using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. armnn::INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0, "input"); @@ -59,7 +62,7 @@ BOOST_AUTO_TEST_CASE(Unsigned8) input->GetOutputSlot(0).Connect(softmax->GetInputSlot(0)); softmax->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. TensorInfo inputTensorInfo(TensorShape({1, 5}), DataType::QuantisedAsymm8); inputTensorInfo.SetQuantizationOffset(100); inputTensorInfo.SetQuantizationScale(10000.0f); @@ -71,17 +74,18 @@ BOOST_AUTO_TEST_CASE(Unsigned8) softmax->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; auto error = runtime->LoadNetwork(netId, std::move(optNet)); BOOST_TEST(error == Status::Success); - // create structures for input & output + // Creates structures for input & output. std::vector<uint8_t> inputData { - 1, 10, 3, 200, 5 // some inputs - one of which is sufficiently larger than the others to saturate softmax + 1, 10, 3, 200, 5 // Some inputs - one of which is sufficiently larger than the others to saturate softmax. }; std::vector<uint8_t> outputData(5); @@ -94,19 +98,19 @@ BOOST_AUTO_TEST_CASE(Unsigned8) {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results. BOOST_TEST(outputData[0] == 0); BOOST_TEST(outputData[1] == 0); BOOST_TEST(outputData[2] == 0); - BOOST_TEST(outputData[3] == 255); // softmax has been saturated + BOOST_TEST(outputData[3] == 255); // softmax has been saturated. BOOST_TEST(outputData[4] == 0); } template <typename T> -void ConstantUsageTest(armnn::Compute computeDevice, +void ConstantUsageTest(const std::vector<armnn::Compute>& computeDevice, const armnn::TensorInfo& commonTensorInfo, const std::vector<T>& inputData, const std::vector<T>& constantData, @@ -115,9 +119,10 @@ void ConstantUsageTest(armnn::Compute computeDevice, using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(computeDevice)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. 
INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0); @@ -129,19 +134,19 @@ void ConstantUsageTest(armnn::Compute computeDevice, constant->GetOutputSlot(0).Connect(add->GetInputSlot(1)); add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. input->GetOutputSlot(0).SetTensorInfo(commonTensorInfo); constant->GetOutputSlot(0).SetTensorInfo(commonTensorInfo); add->GetOutputSlot(0).SetTensorInfo(commonTensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + IOptimizedNetworkPtr optNet = Optimize(*net, computeDevice, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; runtime->LoadNetwork(netId, std::move(optNet)); - // create structures for input & output + // Creates structures for input & output. std::vector<T> outputData(inputData.size()); InputTensors inputTensors @@ -153,26 +158,26 @@ void ConstantUsageTest(armnn::Compute computeDevice, {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results. BOOST_TEST(outputData == expectedOutputData); } -static void ConstantUsageFloat32Test(armnn::Compute computeDevice) +static void ConstantUsageFloat32Test(const std::vector<armnn::Compute>& computeDevice) { const armnn::TensorInfo commonTensorInfo({ 2, 3 }, armnn::DataType::Float32); ConstantUsageTest(computeDevice, commonTensorInfo, - std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // input - std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // const input - std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // expected output + std::vector<float>{ 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }, // Input. + std::vector<float>{ 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }, // Const input. + std::vector<float>{ 7.f, 7.f, 7.f, 7.f, 7.f, 7.f } // Expected output. ); } -static void ConstantUsageUint8Test(armnn::Compute computeDevice) +static void ConstantUsageUint8Test(const std::vector<armnn::Compute>& computeDevice) { armnn::TensorInfo commonTensorInfo({ 2, 3 }, armnn::DataType::QuantisedAsymm8); @@ -184,46 +189,49 @@ static void ConstantUsageUint8Test(armnn::Compute computeDevice) ConstantUsageTest(computeDevice, commonTensorInfo, - QuantizedVector<uint8_t>(scale, offset, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }), // input - QuantizedVector<uint8_t>(scale, offset, { 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }), // const input - QuantizedVector<uint8_t>(scale, offset, { 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }) // expected output + QuantizedVector<uint8_t>(scale, offset, { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f }), // Input. + QuantizedVector<uint8_t>(scale, offset, { 6.f, 5.f, 4.f, 3.f, 2.f, 1.f }), // Const input. + QuantizedVector<uint8_t>(scale, offset, { 7.f, 7.f, 7.f, 7.f, 7.f, 7.f }) // Expected output. 
); } BOOST_AUTO_TEST_CASE(ConstantUsage_Ref_Float32) { - ConstantUsageFloat32Test(armnn::Compute::CpuRef); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + ConstantUsageFloat32Test(backends); } #if ARMCOMPUTENEON_ENABLED BOOST_AUTO_TEST_CASE(ConstantUsage_Neon_Float32) { - ConstantUsageFloat32Test(armnn::Compute::CpuAcc); + ConstantUsageFloat32Test({armnn::Compute::CpuAcc}); } #endif #if ARMCOMPUTECL_ENABLED BOOST_AUTO_TEST_CASE(ConstantUsage_Cl_Float32) { - ConstantUsageFloat32Test(armnn::Compute::GpuAcc); + ConstantUsageFloat32Test({armnn::Compute::GpuAcc}); } #endif BOOST_AUTO_TEST_CASE(ConstantUsage_Ref_Uint8) { - ConstantUsageUint8Test(armnn::Compute::CpuRef); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + ConstantUsageUint8Test(backends); } BOOST_AUTO_TEST_CASE(TrivialAdd) { - // This test was designed to match "AddTwo" in android nn/runtime/test/TestTrivialModel.cpp + // This test was designed to match "AddTwo" in android nn/runtime/test/TestTrivialModel.cpp. using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. armnn::INetworkPtr net(INetwork::Create()); IConnectableLayer* input1 = net->AddInputLayer(0); @@ -235,20 +243,21 @@ BOOST_AUTO_TEST_CASE(TrivialAdd) input2->GetOutputSlot(0).Connect(add->GetInputSlot(1)); add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. TensorInfo tensorInfo(TensorShape({3, 4}), DataType::Float32); input1->GetOutputSlot(0).SetTensorInfo(tensorInfo); input2->GetOutputSlot(0).SetTensorInfo(tensorInfo); add->GetOutputSlot(0).SetTensorInfo(tensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; runtime->LoadNetwork(netId, std::move(optNet)); - // create structures for input & output - matching android nn test + // Creates structures for input & output - matching android nn test. std::vector<float> input1Data { 1.f, 2.f, 3.f, 4.f, 5.f, 6.f, 7.f, 8.f, 9.f, 10.f, 11.f, 12.f @@ -269,10 +278,10 @@ BOOST_AUTO_TEST_CASE(TrivialAdd) {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results BOOST_TEST(outputData[0] == 101); BOOST_TEST(outputData[1] == 202); BOOST_TEST(outputData[2] == 303); @@ -292,9 +301,10 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) using namespace armnn; // Create runtime in which test will run - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // build up the structure of the network + // Builds up the structure of the network. 
INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0); @@ -331,7 +341,7 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) activation2->GetOutputSlot(0).Connect(output2->GetInputSlot(0)); activation3->GetOutputSlot(0).Connect(output3->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. TensorInfo tensorInfo(TensorShape({ 10 }), DataType::Float32); input->GetOutputSlot(0).SetTensorInfo(tensorInfo); activation1->GetOutputSlot(0).SetTensorInfo(tensorInfo); @@ -339,13 +349,14 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) activation3->GetOutputSlot(0).SetTensorInfo(tensorInfo); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); - // load it into the runtime + // Loads it into the runtime. NetworkId netId; runtime->LoadNetwork(netId, std::move(optNet)); - // create structures for input & output + // Creates structures for input & output. const std::vector<float> inputData{ 3.f, 5.f, 2.f, 3.f, 7.f, 0.f, -2.f, -1.f, 3.f, 3.f }; std::vector<float> output1Data(inputData.size()); @@ -363,32 +374,66 @@ BOOST_AUTO_TEST_CASE(MultipleOutputs) {2,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 2), output3Data.data())} }; - // do the inference + // Does the inference. runtime->EnqueueWorkload(netId, inputTensors, outputTensors); - // check the results + // Checks the results. BOOST_TEST(output1Data == std::vector<float>({ 1.f, 1.f, 1.f, 1.f, 1.f, 0.f, -1.f, -1.f, 1.f, 1.f })); // ReLu1 BOOST_TEST(output2Data == std::vector<float>({ 3.f, 5.f, 2.f, 3.f, 6.f, 0.f, 0.f, 0.f, 3.f, 3.f })); // ReLu6 BOOST_TEST(output3Data == std::vector<float>({ 3.f, 5.f, 2.f, 3.f, 5.f, 2.f, 2.f, 2.f, 3.f, 3.f })); // [2, 5] } #if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(FallbackToCpuRef) +{ + using namespace armnn; + + // Create runtime in which test will run and allow fallback to CpuRef. + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Builds up the structure of the network. + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc but we allow fallback to CpuRef so it shoud pass. + NormalizationDescriptor descriptor; + IConnectableLayer* pooling = net->AddNormalizationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(pooling->GetInputSlot(0)); + pooling->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + + // optimize the network + std::vector<Compute> backends = {Compute::CpuAcc, Compute::CpuRef}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should pass. + NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Success); +} +#endif // ARMCOMPUTENEON_ENABLED + BOOST_AUTO_TEST_CASE(ErrorOnLoadNetwork) { using namespace armnn; // Create runtime in which test will run // Note we don't allow falling back to CpuRef if an operation (excluding inputs, outputs, etc.) 
isn't supported - armnn::IRuntime::CreationOptions options(armnn::Compute::CpuAcc); - options.m_UseCpuRefAsFallback = false; - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); // build up the structure of the network INetworkPtr net(INetwork::Create()); IConnectableLayer* input = net->AddInputLayer(0); - // This layer configuration isn't supported by CpuAcc and isn't allowed to fall back, so LoadNetwork will fail. + // This layer configuration isn't supported by CpuAcc and isn't allowed to fall back, so Optimize will return null. NormalizationDescriptor descriptor; IConnectableLayer* pooling = net->AddNormalizationLayer(descriptor); @@ -401,12 +446,9 @@ BOOST_AUTO_TEST_CASE(ErrorOnLoadNetwork) pooling->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); // optimize the network - IOptimizedNetworkPtr optNet = Optimize(*net, runtime->GetDeviceSpec()); - - // Load it into the runtime. It should fail. - NetworkId netId; - BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Failure); + std::vector<Compute> backends = {Compute::CpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(!optNet); } -#endif // ARMCOMPUTENEON_ENABLED BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/FP16SupportTest.cpp b/src/armnn/test/FP16SupportTest.cpp new file mode 100644 index 0000000000..cc3b60369c --- /dev/null +++ b/src/armnn/test/FP16SupportTest.cpp @@ -0,0 +1,114 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "armnn/ArmNN.hpp" +#include "armnn/Descriptors.hpp" +#include "Graph.hpp" +#include "armnn/IRuntime.hpp" +#include "armnn/INetwork.hpp" +#include "Optimizer.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/test/QuantizeHelper.hpp" + +#include <boost/core/ignore_unused.hpp> +#include <boost/test/unit_test.hpp> + +#include <Half.hpp> +#include <set> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Fp16Support) + +BOOST_AUTO_TEST_CASE(Fp16DataTypeSupport) +{ + Graph graph; + + Layer* const inputLayer1 = graph.AddLayer<InputLayer>(1, "input1"); + Layer* const inputLayer2 = graph.AddLayer<InputLayer>(2, "input2"); + + Layer* const additionLayer = graph.AddLayer<AdditionLayer>("addition"); + Layer* const outputLayer = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + TensorInfo fp16TensorInfo({1, 2, 3, 5}, armnn::DataType::Float16); + inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + inputLayer1->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + inputLayer2->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + additionLayer->GetOutputSlot().SetTensorInfo(fp16TensorInfo); + + BOOST_CHECK(inputLayer1->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + BOOST_CHECK(inputLayer2->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + BOOST_CHECK(additionLayer->GetOutputSlot(0).GetTensorInfo().GetDataType() == armnn::DataType::Float16); + +} + +BOOST_AUTO_TEST_CASE(Fp16AdditionTest) +{ + using namespace half_float::literal; + // Create runtime in which test will run + IRuntime::CreationOptions options; + IRuntimePtr runtime(IRuntime::Create(options)); + + // Builds up the structure of 
the network. + INetworkPtr net(INetwork::Create()); + + + IConnectableLayer* inputLayer1 = net->AddInputLayer(0); + IConnectableLayer* inputLayer2 = net->AddInputLayer(1); + IConnectableLayer* additionLayer = net->AddAdditionLayer(); + IConnectableLayer* outputLayer = net->AddOutputLayer(0); + + inputLayer1->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + inputLayer2->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + additionLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + //change to float16 + TensorInfo fp16TensorInfo(TensorShape({4}), DataType::Float16); + inputLayer1->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + inputLayer2->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + additionLayer->GetOutputSlot(0).SetTensorInfo(fp16TensorInfo); + + // optimize the network + std::vector<Compute> backends = {Compute::GpuAcc}; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Loads it into the runtime. + + NetworkId netId; + runtime->LoadNetwork(netId, std::move(optNet)); + + std::vector<Half> input1Data + { + 1.0_h, 2.0_h, 3.0_h, 4.0_h + }; + + std::vector<Half> input2Data + { + 100.0_h, 200.0_h, 300.0_h, 400.0_h + }; + + InputTensors inputTensors + { + {0,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input1Data.data())}, + {1,ConstTensor(runtime->GetInputTensorInfo(netId, 0), input2Data.data())} + }; + + std::vector<Half> outputData(input1Data.size()); + OutputTensors outputTensors + { + {0,Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} + }; + + // Does the inference. + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + + // Checks the results. + BOOST_TEST(outputData == std::vector<Half>({ 101.0_h, 202.0_h, 303.0_h, 404.0_h})); // Add +} + +BOOST_AUTO_TEST_SUITE_END()
\ No newline at end of file diff --git a/src/armnn/test/FloatingPointConverterTest.cpp b/src/armnn/test/FloatingPointConverterTest.cpp new file mode 100644 index 0000000000..d936e801ef --- /dev/null +++ b/src/armnn/test/FloatingPointConverterTest.cpp @@ -0,0 +1,58 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include "FloatingPointConverter.hpp" +#include "Half.hpp" + +#include <malloc.h> +#include <iostream> +#include <algorithm> + +#include <boost/test/unit_test.hpp> + +BOOST_AUTO_TEST_SUITE(TestFPConversion) + +BOOST_AUTO_TEST_CASE(TestConvertFp32ToFp16) +{ + using namespace half_float::literal; + + float floatArray[] = { 1.0f, 2.0f, 0.5f, 3.1f, 2.4f, + 5.666f, 6.444f, 7.1f, 432.121f, 12.22f }; + size_t numFloats = sizeof(floatArray) / sizeof(floatArray[0]); + std::vector<armnn::Half> convertedBuffer(numFloats, 0.0_h); + + armnnUtils::FloatingPointConverter::ConvertFloat32To16(floatArray, numFloats, convertedBuffer.data()); + + for (size_t i = 0; i < numFloats; i++) + { + armnn::Half expected(floatArray[i]); + armnn::Half actual = convertedBuffer[i]; + BOOST_CHECK_EQUAL(expected, actual); + + float convertedHalf = actual; + BOOST_CHECK_CLOSE(floatArray[i], convertedHalf, 0.07); + } +} + +BOOST_AUTO_TEST_CASE(TestConvertFp16ToFp32) +{ + using namespace half_float::literal; + + armnn::Half halfArray[] = { 1.0_h, 2.0_h, 0.5_h, 3.1_h, 2.4_h, + 5.666_h, 6.444_h, 7.1_h, 432.121_h, 12.22_h }; + size_t numFloats = sizeof(halfArray) / sizeof(halfArray[0]); + std::vector<float> convertedBuffer(numFloats, 0.0f); + + armnnUtils::FloatingPointConverter::ConvertFloat16To32(halfArray, numFloats, convertedBuffer.data()); + + for (size_t i = 0; i < numFloats; i++) + { + float expected(halfArray[i]); + float actual = convertedBuffer[i]; + BOOST_CHECK_EQUAL(expected, actual); + } +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/GraphTests.cpp b/src/armnn/test/GraphTests.cpp index 99789e4737..ccbcb8b00b 100644 --- a/src/armnn/test/GraphTests.cpp +++ b/src/armnn/test/GraphTests.cpp @@ -15,7 +15,7 @@ #include <boost/cast.hpp> -/// checks that first comes before second in the order +/// Checks that first comes before second in the order. bool CheckOrder(const armnn::Graph& graph, const armnn::Layer* first, const armnn::Layer* second) { graph.Print(); @@ -69,7 +69,7 @@ BOOST_AUTO_TEST_CASE(TopologicalSort) armnn::Layer* const layerE = GetFirstLayerWithName(graph, "layerE"); armnn::Layer* const layerD = GetFirstLayerWithName(graph, "layerD"); - // simple graph which branches and rejoins + // Simple graph which branches and rejoins. // A // / \' // D E @@ -92,7 +92,7 @@ BOOST_AUTO_TEST_CASE(TopologicalSort) BOOST_TEST(CheckOrder(graph, layerB, layerC)); } -BOOST_AUTO_TEST_CASE(InsertNewLayer) +BOOST_AUTO_TEST_CASE(InsertNewLayerBefore) { armnn::Graph graph; armnn::TensorInfo tensorInfo({ 1, 1, 1, 1 }, armnn::DataType::Float32); @@ -128,7 +128,7 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) layerC->GetOutputSlot(0).Connect(layerD->GetInputSlot(1)); layerD->GetOutputSlot(0).Connect(layerO->GetInputSlot(0)); - // check order is valid + // Checks order is valid. BOOST_TEST(CheckOrder(graph, layerA, layerB)); BOOST_TEST(CheckOrder(graph, layerA, layerC)); BOOST_TEST(CheckOrder(graph, layerB, layerD)); @@ -147,7 +147,7 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) armnn::Layer* const layerE = GetFirstLayerWithName(graph, "layerE"); - // check order is valid + // Checks order is valid. 
BOOST_TEST(CheckOrder(graph, layerA, layerB)); BOOST_TEST(CheckOrder(graph, layerA, layerC)); BOOST_TEST(CheckOrder(graph, layerB, layerD)); @@ -169,7 +169,7 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) armnn::Layer* const layerF = GetFirstLayerWithName(graph, "layerF"); - // check order is valid + // Checks order is valid. BOOST_TEST(CheckOrder(graph, layerA, layerB)); BOOST_TEST(CheckOrder(graph, layerA, layerF)); BOOST_TEST(CheckOrder(graph, layerF, layerC)); @@ -178,6 +178,93 @@ BOOST_AUTO_TEST_CASE(InsertNewLayer) BOOST_TEST(CheckOrder(graph, layerE, layerD)); } +BOOST_AUTO_TEST_CASE(InsertNewLayerAfter) +{ + armnn::Graph graph; + armnn::TensorInfo tensorInfo({ 1, 1, 1, 1 }, armnn::DataType::Float32); + + std::vector<armnn::Layer*> order; + + armnn::ActivationDescriptor activationDefaults; + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::InputLayer>(0, "layerA")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::ActivationLayer>(activationDefaults, "layerB")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::ActivationLayer>(activationDefaults, "layerC")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::AdditionLayer>("layerD")); + BOOST_CHECK_NO_THROW(graph.AddLayer<armnn::OutputLayer>(0, "output")); + + armnn::Layer* const layerA = GetFirstLayerWithName(graph, "layerA"); + armnn::Layer* const layerB = GetFirstLayerWithName(graph, "layerB"); + armnn::Layer* const layerC = GetFirstLayerWithName(graph, "layerC"); + armnn::Layer* const layerD = GetFirstLayerWithName(graph, "layerD"); + armnn::Layer* const layerO = GetFirstLayerWithName(graph, "output"); + + // A + // / \' + // B C + // \ / + // D + layerA->GetOutputSlot(0).SetTensorInfo(tensorInfo); + layerB->GetOutputSlot(0).SetTensorInfo(tensorInfo); + layerC->GetOutputSlot(0).SetTensorInfo(tensorInfo); + layerD->GetOutputSlot(0).SetTensorInfo(tensorInfo); + + layerA->GetOutputSlot(0).Connect(layerB->GetInputSlot(0)); + layerA->GetOutputSlot(0).Connect(layerC->GetInputSlot(0)); + layerB->GetOutputSlot(0).Connect(layerD->GetInputSlot(0)); + layerC->GetOutputSlot(0).Connect(layerD->GetInputSlot(1)); + layerD->GetOutputSlot(0).Connect(layerO->GetInputSlot(0)); + + // Checks order is valid. + BOOST_TEST(CheckOrder(graph, layerA, layerB)); + BOOST_TEST(CheckOrder(graph, layerA, layerC)); + BOOST_TEST(CheckOrder(graph, layerB, layerD)); + BOOST_TEST(CheckOrder(graph, layerC, layerD)); + + // A + // / \' + // B C + // \ | + // \ E + // \| + // D + BOOST_CHECK_NO_THROW(graph.InsertNewLayer<armnn::ActivationLayer>(layerC->GetOutputSlot(), + activationDefaults, + "layerE")); + + armnn::Layer* const layerE = GetFirstLayerWithName(graph, "layerE"); + + // Checks order is valid. + BOOST_TEST(CheckOrder(graph, layerA, layerB)); + BOOST_TEST(CheckOrder(graph, layerA, layerC)); + BOOST_TEST(CheckOrder(graph, layerB, layerD)); + BOOST_TEST(CheckOrder(graph, layerC, layerE)); + BOOST_TEST(CheckOrder(graph, layerE, layerD)); + + + // A + // | + // F + // / \' + // B C + // \ | + // \ E + // \ / + // D + BOOST_CHECK_NO_THROW(graph.InsertNewLayer<armnn::ActivationLayer>(layerA->GetOutputSlot(), + activationDefaults, + "layerF")); + + armnn::Layer* const layerF = GetFirstLayerWithName(graph, "layerF"); + + // Checks order is valid. 
+ BOOST_TEST(CheckOrder(graph, layerA, layerF)); + BOOST_TEST(CheckOrder(graph, layerF, layerB)); + BOOST_TEST(CheckOrder(graph, layerF, layerC)); + BOOST_TEST(CheckOrder(graph, layerB, layerD)); + BOOST_TEST(CheckOrder(graph, layerC, layerE)); + BOOST_TEST(CheckOrder(graph, layerE, layerD)); +} + namespace { using Edge = std::pair<const armnn::Layer*, const armnn::Layer*>; @@ -210,7 +297,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn std::vector<Edge> origEdges = GetEdgeList(origGraph); std::vector<Edge> newEdges = GetEdgeList(graph); - // Adding copy layers should not produce any duplicate edges + // Adding copy layers should not produce any duplicate edges. { std::vector<Edge> sortedNewEdges = newEdges; std::sort(sortedNewEdges.begin(), sortedNewEdges.end()); @@ -219,7 +306,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn BOOST_CHECK_MESSAGE(last == sortedNewEdges.end(), "New graph contains duplicate edges!"); } - // Each new edge must be tested + // Each new edge must be tested. while (!newEdges.empty()) { const Edge edge = std::move(newEdges.back()); @@ -251,7 +338,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn BOOST_TEST((srcLayer->GetComputeDevice() == dstLayer->GetComputeDevice())); } - // Mark edge in original graph as observed (by deleting it) + // Marks edge in original graph as observed (by deleting it). origEdges.erase(origEdges.begin() + originalEdge); } else @@ -288,7 +375,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn const armnn::Layer* copyLayer = srcLayerInOrigGraph ? edge.second : edge.first; const armnn::Layer* nonCopyLayer = srcLayerInOrigGraph ? srcLayer : dstLayer; - // Find all edges connecting the copy layer to other layers + // Finds all edges connecting the copy layer to other layers. std::vector<Edge> adjEdges; auto it = newEdges.begin(); while (it != newEdges.end()) @@ -298,7 +385,7 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn { adjEdges.push_back(newEdge); - // Since the adjacent edge is immediately tested below, no need to consider it afterwards + // Since the adjacent edge is immediately tested below, there is no need to consider it afterwards. it = newEdges.erase(it); } else @@ -315,10 +402,10 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn continue; } - // Test adjacent edges now + // Tests adjacent edges now. for (const Edge& adjEdge : adjEdges) { - // The adjacent edge must connect the copy layer to another layer + // The adjacent edge must connect the copy layer to another layer. const armnn::Layer* adjLayer = srcLayerInOrigGraph ? adjEdge.second : adjEdge.first; if (!adjLayer) @@ -329,10 +416,10 @@ static void TestGraphAfterAddingCopyLayers(const armnn::Graph& graph, const armn continue; } - // Both layers must have different compute devices + // Both layers must have different compute devices. BOOST_TEST((nonCopyLayer->GetComputeDevice() != adjLayer->GetComputeDevice())); - // There must exist an edge connecting both layers directly in the original graph + // There must exist an edge connecting both layers directly in the original graph. { const armnn::Layer* origEdgeN1 = srcLayerInOrigGraph ? nonCopyLayer : adjLayer; const armnn::Layer* origEdgeN2 = srcLayerInOrigGraph ? 
adjLayer : nonCopyLayer; @@ -434,7 +521,7 @@ BOOST_FIXTURE_TEST_CASE(AddCopyLayersSeveralTimes, CopyLayersFixture) { m_Graph.AddCopyLayers(); - // Calling AddCopyLayers() several times should not change the connections + // Calling AddCopyLayers() several times should not change the connections. const std::vector<Edge> edges = GetEdgeList(m_Graph); for (int i = 0; i < 4; ++i) { diff --git a/src/armnn/test/InstrumentTests.cpp b/src/armnn/test/InstrumentTests.cpp new file mode 100644 index 0000000000..a219b39b0d --- /dev/null +++ b/src/armnn/test/InstrumentTests.cpp @@ -0,0 +1,62 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include <boost/test/unit_test.hpp> + +#include "WallClockTimer.hpp" + +#include <chrono> +#include <thread> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(Instruments) + +BOOST_AUTO_TEST_CASE(WallClockTimerInMilliseconds) +{ + WallClockTimer wallClockTimer; + + BOOST_CHECK_EQUAL(wallClockTimer.GetName(), "WallClockTimer"); + + // start the timer + wallClockTimer.Start(); + + // wait for 10 milliseconds + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // stop the timer + wallClockTimer.Stop(); + + BOOST_CHECK_EQUAL(wallClockTimer.GetMeasurements().front().m_Name, WallClockTimer::WALL_CLOCK_TIME); + + // check that WallClockTimer measurement should be >= 10 milliseconds + BOOST_CHECK_GE(wallClockTimer.GetMeasurements().front().m_Value, std::chrono::milliseconds(10).count()); +} + +BOOST_AUTO_TEST_CASE(WallClockTimerInNanoseconds) +{ + WallClockTimer wallClockTimer; + + BOOST_CHECK_EQUAL(wallClockTimer.GetName(), "WallClockTimer"); + + // start the timer + wallClockTimer.Start(); + + // wait for 500 nanoseconds - 0.0005 milliseconds + std::this_thread::sleep_for(std::chrono::nanoseconds(500)); + + // stop the timer + wallClockTimer.Stop(); + + BOOST_CHECK_EQUAL(wallClockTimer.GetMeasurements().front().m_Name, WallClockTimer::WALL_CLOCK_TIME); + + // delta is 0.0005 milliseconds + const auto delta = + std::chrono::duration_cast<std::chrono::duration<double, std::milli>>(std::chrono::nanoseconds(500)); + + // check that WallClockTimer measurement should be >= 0.0005 milliseconds + BOOST_CHECK_GE(wallClockTimer.GetMeasurements().front().m_Value, delta.count()); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/JsonPrinterTests.cpp b/src/armnn/test/JsonPrinterTests.cpp new file mode 100644 index 0000000000..28cbfd61a5 --- /dev/null +++ b/src/armnn/test/JsonPrinterTests.cpp @@ -0,0 +1,378 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// +#include <boost/test/unit_test.hpp> +#include <boost/algorithm/string.hpp> +#include <boost/lexical_cast.hpp> +#include <stack> +#include <string> +#include <vector> +#include <sstream> + +#include "Profiling.hpp" +#include "armnn/Descriptors.hpp" +#include "armnn/IRuntime.hpp" +#include "armnn/INetwork.hpp" +#include "backends/test/ClContextControlFixture.hpp" +#include "backends/ClWorkloadFactory.hpp" + +BOOST_FIXTURE_TEST_SUITE(JsonPrinterTests, ClProfilingContextControlFixture) + +bool AreMatchingPair(const char opening, const char closing) +{ + return (opening == '{' && closing == '}') || (opening == '[' && closing == ']'); +} + +bool AreParenthesesMatching(const std::string& exp) +{ + std::stack<char> expStack; + for (size_t i = 0; i < exp.length(); ++i) + { + if (exp[i] == '{' || exp[i] == '[') + { + expStack.push(exp[i]); + } + else if (exp[i] == '}' || exp[i] == ']') + { + if (expStack.empty() || !AreMatchingPair(expStack.top(), exp[i])) + { + return false; + } + else + { + expStack.pop(); + } + } + } + return expStack.empty(); +} + +std::vector<double> ExtractMeasurements(const std::string& exp) +{ + std::vector<double> numbers; + bool inArray = false; + std::string numberString; + for (size_t i = 0; i < exp.size(); ++i) + { + if (exp[i] == '[') + { + inArray = true; + } + else if (exp[i] == ']' && inArray) + { + try + { + boost::trim_if(numberString, boost::is_any_of("\t,\n")); + numbers.push_back(std::stod(numberString)); + } + catch (std::invalid_argument const& e) + { + BOOST_FAIL("Could not convert measurements to double: " + numberString); + } + + numberString.clear(); + inArray = false; + } + else if (exp[i] == ',' && inArray) + { + try + { + boost::trim_if(numberString, boost::is_any_of("\t,\n")); + numbers.push_back(std::stod(numberString)); + } + catch (std::invalid_argument const& e) + { + BOOST_FAIL("Could not convert measurements to double: " + numberString); + } + numberString.clear(); + } + else if (exp[i] != '[' && inArray && exp[i] != ',' && exp[i] != ' ') + { + numberString += exp[i]; + } + } + return numbers; +} + +std::vector<std::string> ExtractSections(const std::string& exp) +{ + std::vector<std::string> sections; + + std::stack<size_t> s; + for (size_t i = 0; i < exp.size(); i++) + { + if (exp.at(i) == '{') + { + s.push(i); + } + else if (exp.at(i) == '}') + { + size_t from = s.top(); + s.pop(); + sections.push_back(exp.substr(from, i - from + 1)); + } + } + + return sections; +} + +std::string SoftmaxProfilerTestSetupHelper(const std::vector<armnn::Compute>& backends) +{ + using namespace armnn; + + BOOST_CHECK(!backends.empty()); + + ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + // Create runtime in which test will run + IRuntime::CreationOptions options; + options.m_EnableGpuProfiling = backends.front() == armnn::Compute::GpuAcc; + IRuntimePtr runtime(IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0, "input"); + IConnectableLayer* softmax = net->AddSoftmaxLayer(SoftmaxDescriptor(), "softmax"); + IConnectableLayer* output = net->AddOutputLayer(0, "output"); + + input->GetOutputSlot(0).Connect(softmax->GetInputSlot(0)); + softmax->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + // set the tensors in the network + TensorInfo inputTensorInfo(TensorShape({1, 5}), DataType::QuantisedAsymm8); + inputTensorInfo.SetQuantizationOffset(100); + inputTensorInfo.SetQuantizationScale(10000.0f); + 
input->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); + + TensorInfo outputTensorInfo(TensorShape({1, 5}), DataType::QuantisedAsymm8); + outputTensorInfo.SetQuantizationOffset(0); + outputTensorInfo.SetQuantizationScale(1.0f / 256.0f); + softmax->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); + + // optimize the network + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + if(!optNet) + { + BOOST_FAIL("Error occurred during Optimization, Optimize() returned nullptr."); + } + // load it into the runtime + NetworkId netId; + auto error = runtime->LoadNetwork(netId, std::move(optNet)); + BOOST_TEST(error == Status::Success); + + // create structures for input & output + std::vector<uint8_t> inputData + { + 1, 10, 3, 200, 5 + // one of inputs is sufficiently larger than the others to saturate softmax + }; + std::vector<uint8_t> outputData(5); + + armnn::InputTensors inputTensors + { + {0, armnn::ConstTensor(runtime->GetInputTensorInfo(netId, 0), inputData.data())} + }; + armnn::OutputTensors outputTensors + { + {0, armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), outputData.data())} + }; + + runtime->GetProfiler(netId)->EnableProfiling(true); + + // do the inferences + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + runtime->EnqueueWorkload(netId, inputTensors, outputTensors); + + // retrieve the Profiler.Print() output + std::stringstream ss; + profilerManager.GetProfiler()->Print(ss); + + return ss.str(); +} + +void SoftmaxProfilerTestValidationHelper(std::string& result, const std::string& testData) +{ + // ensure all measurements are greater than zero + std::vector<double> measurementsVector = ExtractMeasurements(result); + BOOST_CHECK(!measurementsVector.empty()); + + // check sections contain raw and unit tags + // first ensure Parenthesis are balanced + if (AreParenthesesMatching(result)) + { + // remove parent sections that will not have raw or unit tag + std::vector<std::string> sectionVector = ExtractSections(result); + for (size_t i = 0; i < sectionVector.size(); ++i) + { + if (boost::contains(sectionVector[i], "\"ArmNN\":") + || boost::contains(sectionVector[i], "\"inference_measurements\":")) + { + sectionVector.erase(sectionVector.begin() + static_cast<int>(i)); + } + } + BOOST_CHECK(!sectionVector.empty()); + + BOOST_CHECK(std::all_of(sectionVector.begin(), sectionVector.end(), + [](std::string i) { return boost::contains(i, "\"raw\":"); })); + + BOOST_CHECK(std::all_of(sectionVector.begin(), sectionVector.end(), + [](std::string i) { return boost::contains(i, "\"unit\":"); })); + } + + // remove the time measurements as they vary from test to test + result.erase(std::remove_if (result.begin(),result.end(), + [](char c) { return c == '.'; }), result.end()); + result.erase(std::remove_if (result.begin(), result.end(), &isdigit), result.end()); + result.erase(std::remove_if (result.begin(),result.end(), + [](char c) { return c == '\t'; }), result.end()); + + BOOST_CHECK(boost::contains(result, "ArmNN")); + BOOST_CHECK(boost::contains(result, "inference_measurements")); + BOOST_CHECK(boost::contains(result, "layer_measurements")); + BOOST_CHECK_EQUAL(result, testData); + + // ensure no spare parenthesis present in print output + BOOST_CHECK(AreParenthesesMatching(result)); +} + +void SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult( + const std::vector<armnn::Compute>& backends) +{ + // setup the test fixture and obtain JSON Printer result + 
std::string result = SoftmaxProfilerTestSetupHelper(backends); + + std::string backend = "Ref"; + std::string changeLine31 = "\n},\n\"CopyMemGeneric_Execute\": {"; + std::string changeLine39 = "ms\""; + std::string changeLine40; + std::string changeLine45; + + switch(backends[0]) { + case armnn::Compute::GpuAcc: backend = "Cl"; + changeLine31 = ",\n\"OpenClKernelTimer/: softmax_layer_max_shift_exp_sum_quantized_serial GWS[,,]\": {"; + changeLine39 = R"(us" +}, +"OpenClKernelTimer/: softmax_layer_norm_quantized GWS[,,]": { +"raw": [ +, +, + +], +"unit": "us")"; + + changeLine40 = R"( +}, +"CopyMemGeneric_Execute": { +"raw": [ +, +, + +], +"unit": "ms")"; + changeLine45 = "}\n"; + break; + case armnn::Compute::CpuAcc: backend = "Neon"; + changeLine31 = ",\n\"NeonKernelTimer/: NEFillBorderKernel\": {"; + changeLine39 = R"(ms" +}, +"NeonKernelTimer/: NELogitsDMaxKernel": { +"raw": [ +, +, + +], +"unit": "ms" +}, +"NeonKernelTimer/: NELogitsDSoftmaxKernel": { +"raw": [ +, +, + +], +"unit": "ms")"; + changeLine40 = R"( +}, +"CopyMemGeneric_Execute": { +"raw": [ +, +, + +], +"unit": "ms")"; + changeLine45 = "}\n"; + break; + default: + break; + } + std::string testData = R"({ +"ArmNN": { +"inference_measurements": { +"raw": [ +, +, + +], +"unit": "ms", +"layer_measurements": { +"raw": [ +, +, + +], +"unit": "ms", +"CopyMemGeneric_Execute": { +"raw": [ +, +, + +], +"unit": "ms" +}, +")" + backend + R"(SoftmaxUintWorkload_Execute": { +"raw": [ +, +, + +], +"unit": "ms")" + changeLine31 + R"( +"raw": [ +, +, + +], +"unit": ")" + changeLine39 + R"( +})" + changeLine40 + R"( +} +} +} +} +)" + changeLine45 + R"()"; + + // validate the JSON Printer result + SoftmaxProfilerTestValidationHelper(result, testData); +} + +BOOST_AUTO_TEST_CASE(SoftmaxProfilerJSONPrinterCpuRefTest) +{ + SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult({armnn::Compute::CpuRef}); +} + + +#if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(SoftmaxProfilerJSONPrinterCpuAccTest) +{ + SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult({armnn::Compute::CpuAcc}); +} +#endif + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(SoftmaxProfilerJSONPrinterGpuAccTest) +{ + SetupSoftmaxProfilerWithSpecifiedBackendsAndValidateJSONPrinterResult({armnn::Compute::GpuAcc}); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/NeonTimerTest.cpp b/src/armnn/test/NeonTimerTest.cpp new file mode 100644 index 0000000000..4502756e07 --- /dev/null +++ b/src/armnn/test/NeonTimerTest.cpp @@ -0,0 +1,104 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. 
+// + +#include "NeonTimer.hpp" +#include "TensorHelpers.hpp" + +#include "armnn/ArmNN.hpp" +#include "armnn/Tensor.hpp" +#include "armnn/TypesUtils.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "backends/NeonWorkloadFactory.hpp" +#include "backends/WorkloadInfo.hpp" +#include "backends/WorkloadFactory.hpp" +#include "backends/test/LayerTests.hpp" +#include "backends/test/TensorCopyUtils.hpp" +#include "backends/test/WorkloadTestUtils.hpp" + +#include <boost/test/unit_test.hpp> +#include <cstdlib> +#include <algorithm> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(NeonTimerInstrument) + + +BOOST_AUTO_TEST_CASE(NeonTimerGetName) +{ + NeonTimer neonTimer; + BOOST_CHECK_EQUAL(neonTimer.GetName(), "NeonKernelTimer"); +} + +BOOST_AUTO_TEST_CASE(NeonTimerMeasure) +{ + NeonWorkloadFactory workloadFactory; + + unsigned int inputWidth = 4000u; + unsigned int inputHeight = 5000u; + unsigned int inputChannels = 1u; + unsigned int inputBatchSize = 1u; + + float upperBound = 1.0f; + float lowerBound = -1.0f; + + size_t inputSize = inputWidth * inputHeight * inputChannels * inputBatchSize; + std::vector<float> inputData(inputSize, 0.f); + std::generate(inputData.begin(), inputData.end(), [](){ + return (static_cast<float>(rand()) / static_cast<float>(RAND_MAX / 3)) + 1.f; }); + + unsigned int outputWidth = inputWidth; + unsigned int outputHeight = inputHeight; + unsigned int outputChannels = inputChannels; + unsigned int outputBatchSize = inputBatchSize; + + armnn::TensorInfo inputTensorInfo({ inputBatchSize, inputChannels, inputHeight, inputWidth }, + armnn::GetDataType<float>()); + + armnn::TensorInfo outputTensorInfo({ outputBatchSize, outputChannels, outputHeight, outputWidth }, + armnn::GetDataType<float>()); + + LayerTestResult<float, 4> result(inputTensorInfo); + + auto input = MakeTensor<float, 4>(inputTensorInfo, inputData); + + std::unique_ptr<armnn::ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<armnn::ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + // Setup bounded ReLu + armnn::ActivationQueueDescriptor descriptor; + armnn::WorkloadInfo workloadInfo; + AddInputToWorkload(descriptor, workloadInfo, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(descriptor, workloadInfo, outputTensorInfo, outputHandle.get()); + + descriptor.m_Parameters.m_Function = armnn::ActivationFunction::BoundedReLu; + descriptor.m_Parameters.m_A = upperBound; + descriptor.m_Parameters.m_B = lowerBound; + + std::unique_ptr<armnn::IWorkload> workload = workloadFactory.CreateActivation(descriptor, workloadInfo); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + NeonTimer neonTimer; + // Start the timer. + neonTimer.Start(); + // Execute the workload. + workload->Execute(); + // Stop the timer. 
+ neonTimer.Stop(); + + std::vector<Measurement> measurements = neonTimer.GetMeasurements(); + + BOOST_CHECK_EQUAL(measurements.size(), 2); + BOOST_CHECK_EQUAL(measurements[0].m_Name, "NeonKernelTimer/0: NEFillBorderKernel"); + BOOST_CHECK(measurements[0].m_Value > 0.0); + BOOST_CHECK_EQUAL(measurements[1].m_Name, "NeonKernelTimer/1: NEActivationLayerKernel"); + BOOST_CHECK(measurements[1].m_Value > 0.0); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/NetworkTests.cpp b/src/armnn/test/NetworkTests.cpp new file mode 100644 index 0000000000..66fa327221 --- /dev/null +++ b/src/armnn/test/NetworkTests.cpp @@ -0,0 +1,968 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include <boost/test/unit_test.hpp> + +#include "armnn/ArmNN.hpp" +#include "Network.hpp" +#include "Graph.hpp" +#include "backends/RefWorkloadFactory.hpp" +#include "backends/ClWorkloadFactory.hpp" +#include "backends/NeonWorkloadFactory.hpp" + +#include "GraphUtils.hpp" + +namespace +{ + +bool AreAllLayerInputSlotsConnected(const armnn::IConnectableLayer& layer) +{ + bool allConnected = true; + for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) + { + const bool inputConnected = layer.GetInputSlot(i).GetConnection() != nullptr; + allConnected &= inputConnected; + } + return allConnected; +} + +} + +BOOST_AUTO_TEST_SUITE(Network) + +BOOST_AUTO_TEST_CASE(LayerGuids) +{ + armnn::Network net; + armnn::LayerGuid inputId = net.AddInputLayer(0)->GetGuid(); + armnn::LayerGuid addId = net.AddAdditionLayer()->GetGuid(); + armnn::LayerGuid outputId = net.AddOutputLayer(0)->GetGuid(); + + BOOST_TEST(inputId != addId); + BOOST_TEST(addId != outputId); + BOOST_TEST(inputId != outputId); +} + +BOOST_AUTO_TEST_CASE(SerializeToDot) +{ + armnn::Network net; + + //Defines layers. + auto input = net.AddInputLayer(0); + auto add = net.AddAdditionLayer(); + auto output = net.AddOutputLayer(0); + + // Connects layers. 
+ input->GetOutputSlot(0).Connect(add->GetInputSlot(0)); + input->GetOutputSlot(0).Connect(add->GetInputSlot(1)); + add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + armnn::TensorShape shape({4}); + armnn::TensorInfo info(shape, armnn::DataType::Float32); + input->GetOutputSlot(0).SetTensorInfo(info); + add->GetOutputSlot(0).SetTensorInfo(info); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + + std::ostringstream ss; + optimizedNet->SerializeToDot(ss); + + auto inputId = input->GetGuid(); + auto addId = add->GetGuid(); + auto outputId = output->GetGuid(); + + std::stringstream expected; + expected << + "digraph Optimized {\n" + " node [shape=\"record\"];\n" + " edge [fontsize=8 fontcolor=\"blue\" fontname=\"arial-bold\"];\n" + " " << inputId << " [label=\"{Input}\"];\n" + " " << addId << " [label=\"{Addition}\"];\n" + " " << outputId << " [label=\"{Output}\"];\n" + " " << inputId << " -> " << addId << " [label=< [4] >];\n" + " " << inputId << " -> " << addId << " [label=< [4] >];\n" + " " << addId << " -> " << outputId << " [label=< [4] >];\n" + "}\n"; + + BOOST_TEST(ss.str() == expected.str()); +} + +BOOST_AUTO_TEST_CASE(NetworkBasic) +{ + armnn::Network net; + BOOST_TEST(net.PrintGraph() == armnn::Status::Success); +} + +BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForINetwork) +{ + armnn::Network net; + armnn::INetwork& inet = net; + inet.AddInputLayer(0); + inet.AddAdditionLayer(); + inet.AddActivationLayer(armnn::ActivationDescriptor()); + inet.AddOutputLayer(0); +} + +BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForNetwork) +{ + armnn::Network net; + net.AddInputLayer(0); + net.AddAdditionLayer(); + net.AddActivationLayer(armnn::ActivationDescriptor()); + net.AddOutputLayer(0); +} + +BOOST_AUTO_TEST_CASE(NetworkModification) +{ + armnn::Network net; + + armnn::IConnectableLayer* const inputLayer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(inputLayer); + + unsigned int dims[] = { 10,1,1,1 }; + std::vector<float> convWeightsData(10); + armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), convWeightsData); + + armnn::Convolution2dDescriptor convDesc2d; + armnn::IConnectableLayer* const convLayer = net.AddConvolution2dLayer(convDesc2d, weights, "conv layer"); + BOOST_TEST(convLayer); + + inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); + + armnn::FullyConnectedDescriptor fullyConnectedDesc; + armnn::IConnectableLayer* const fullyConnectedLayer = net.AddFullyConnectedLayer(fullyConnectedDesc, + weights, + "fully connected"); + BOOST_TEST(fullyConnectedLayer); + + convLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0)); + + armnn::Pooling2dDescriptor pooling2dDesc; + armnn::IConnectableLayer* const poolingLayer = net.AddPooling2dLayer(pooling2dDesc, "pooling2d"); + BOOST_TEST(poolingLayer); + + fullyConnectedLayer->GetOutputSlot(0).Connect(poolingLayer->GetInputSlot(0)); + + armnn::ActivationDescriptor activationDesc; + armnn::IConnectableLayer* const activationLayer = net.AddActivationLayer(activationDesc, "activation"); + BOOST_TEST(activationLayer); + + poolingLayer->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0)); + + armnn::NormalizationDescriptor normalizationDesc; + armnn::IConnectableLayer* const normalizationLayer = net.AddNormalizationLayer(normalizationDesc, 
"normalization"); + BOOST_TEST(normalizationLayer); + + activationLayer->GetOutputSlot(0).Connect(normalizationLayer->GetInputSlot(0)); + + armnn::SoftmaxDescriptor softmaxDesc; + armnn::IConnectableLayer* const softmaxLayer = net.AddSoftmaxLayer(softmaxDesc, "softmax"); + BOOST_TEST(softmaxLayer); + + normalizationLayer->GetOutputSlot(0).Connect(softmaxLayer->GetInputSlot(0)); + + armnn::BatchNormalizationDescriptor batchNormDesc; + + armnn::TensorInfo tensorInfo({ 1 }, armnn::DataType::Float32); + std::vector<float> data(tensorInfo.GetNumBytes() / sizeof(float)); + armnn::ConstTensor invalidTensor(tensorInfo, data); + + armnn::IConnectableLayer* const batchNormalizationLayer = net.AddBatchNormalizationLayer(batchNormDesc, + invalidTensor, + invalidTensor, + invalidTensor, + invalidTensor, + "batch norm"); + BOOST_TEST(batchNormalizationLayer); + + softmaxLayer->GetOutputSlot(0).Connect(batchNormalizationLayer->GetInputSlot(0)); + + armnn::IConnectableLayer* const additionLayer = net.AddAdditionLayer("addition"); + BOOST_TEST(additionLayer); + + batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); + batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); + + armnn::IConnectableLayer* const multiplicationLayer = net.AddMultiplicationLayer("multiplication"); + BOOST_TEST(multiplicationLayer); + + additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(0)); + additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(1)); + + armnn::IConnectableLayer* const outputLayer = net.AddOutputLayer(0, "output layer"); + BOOST_TEST(outputLayer); + + multiplicationLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + //Tests that all layers are present in the graph. + BOOST_TEST(net.GetGraph().GetNumLayers() == 11); + + //Tests that the vertices exist and have correct names. 
+ BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "input layer")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "conv layer")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "fully connected")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "pooling2d")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "activation")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "normalization")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "softmax")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "batch norm")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "addition")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "multiplication")); + BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "output layer")); + + auto checkOneOutputToOneInputConnection = [] + (const armnn::IConnectableLayer* const srcLayer, + const armnn::IConnectableLayer* const tgtLayer, + int expectedSrcNumInputs = 1, + int expectedDstNumOutputs = 1) + { + BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); + BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); + BOOST_TEST(tgtLayer->GetNumInputSlots() == 1); + BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); + + BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 1); + BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(0) == &tgtLayer->GetInputSlot(0)); + BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(0).GetConnection()); + }; + auto checkOneOutputToTwoInputsConnections = [] + (const armnn::IConnectableLayer* const srcLayer, + const armnn::IConnectableLayer* const tgtLayer, + int expectedSrcNumInputs, + int expectedDstNumOutputs = 1) + { + BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); + BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); + BOOST_TEST(tgtLayer->GetNumInputSlots() == 2); + BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); + + BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 2); + for (unsigned int i = 0; i < srcLayer->GetOutputSlot(0).GetNumConnections(); ++i) + { + BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(i) == &tgtLayer->GetInputSlot(i)); + BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(i).GetConnection()); + } + }; + + BOOST_TEST(AreAllLayerInputSlotsConnected(*convLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*fullyConnectedLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*poolingLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*activationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*normalizationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*softmaxLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*batchNormalizationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*additionLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*multiplicationLayer)); + BOOST_TEST(AreAllLayerInputSlotsConnected(*outputLayer)); + + // Checks connectivity. 
+ checkOneOutputToOneInputConnection(inputLayer, convLayer, 0); + checkOneOutputToOneInputConnection(convLayer, fullyConnectedLayer); + checkOneOutputToOneInputConnection(fullyConnectedLayer, poolingLayer); + checkOneOutputToOneInputConnection(poolingLayer, activationLayer); + checkOneOutputToOneInputConnection(activationLayer, normalizationLayer); + checkOneOutputToOneInputConnection(normalizationLayer, softmaxLayer); + checkOneOutputToOneInputConnection(softmaxLayer, batchNormalizationLayer); + checkOneOutputToTwoInputsConnections(batchNormalizationLayer, additionLayer, 1); + checkOneOutputToTwoInputsConnections(additionLayer, multiplicationLayer, 2); + checkOneOutputToOneInputConnection(multiplicationLayer, outputLayer, 2, 0); +} + +BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMerger) +{ + armnn::Network net; + + // Adds an input layer and an input tensor descriptor. + armnn::IConnectableLayer* inputLayer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(inputLayer); + + // Adds a splitter layer. + armnn::ViewsDescriptor splitterDesc(2,4); + + armnn::IConnectableLayer* splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); + BOOST_TEST(splitterLayer); + + inputLayer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); + + // Adds a softmax layer 1. + armnn::SoftmaxDescriptor softmaxDescriptor; + armnn::IConnectableLayer* softmaxLayer1 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); + BOOST_TEST(softmaxLayer1); + + splitterLayer->GetOutputSlot(0).Connect(softmaxLayer1->GetInputSlot(0)); + + // Adds a softmax layer 2. + armnn::IConnectableLayer* softmaxLayer2 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); + BOOST_TEST(softmaxLayer2); + + splitterLayer->GetOutputSlot(1).Connect(softmaxLayer2->GetInputSlot(0)); + + // Adds a merger layer. + armnn::OriginsDescriptor mergerDesc(2, 4); + + armnn::IConnectableLayer* mergerLayer = net.AddMergerLayer(mergerDesc, "merger layer"); + BOOST_TEST(mergerLayer); + + softmaxLayer1->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(0)); + softmaxLayer2->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(1)); + + // Adds an output layer. + armnn::IConnectableLayer* outputLayer = net.AddOutputLayer(0, "output layer"); + BOOST_TEST(outputLayer); + + mergerLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); + + BOOST_TEST(splitterLayer->GetNumOutputSlots() == 2); + BOOST_TEST(splitterLayer->GetOutputSlot(0).GetConnection(0) == &softmaxLayer1->GetInputSlot(0)); + BOOST_TEST(&splitterLayer->GetOutputSlot(0) == softmaxLayer1->GetInputSlot(0).GetConnection()); + BOOST_TEST(splitterLayer->GetOutputSlot(1).GetConnection(0) == &softmaxLayer2->GetInputSlot(0)); + BOOST_TEST(&splitterLayer->GetOutputSlot(1) == softmaxLayer2->GetInputSlot(0).GetConnection()); + + BOOST_TEST(mergerLayer->GetNumInputSlots() == 2); + BOOST_TEST(softmaxLayer1->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(0)); + BOOST_TEST(&softmaxLayer1->GetOutputSlot(0) == mergerLayer->GetInputSlot(0).GetConnection()); + BOOST_TEST(softmaxLayer2->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(1)); + BOOST_TEST(&softmaxLayer2->GetOutputSlot(0) == mergerLayer->GetInputSlot(1).GetConnection()); +} + +BOOST_AUTO_TEST_CASE(NetworkModification_SplitterAddition) +{ + armnn::Network net; + + // Adds an input layer and an input tensor descriptor. + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(layer); + + // Adds a splitter layer. 
+ armnn::ViewsDescriptor splitterDesc(2,4); + + armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); + BOOST_TEST(splitterLayer); + + layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); + + // Adds a softmax layer 1. + armnn::SoftmaxDescriptor softmaxDescriptor; + armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); + BOOST_TEST(softmax1Layer); + + splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); + + // Adds a softmax layer 2. + armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); + BOOST_TEST(softmax2Layer); + + splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); + + // Adds addition layer. + layer = net.AddAdditionLayer("add layer"); + BOOST_TEST(layer); + + softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + + // Adds an output layer. + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddOutputLayer(0, "output layer"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + BOOST_TEST(layer); +} + +BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMultiplication) +{ + armnn::Network net; + + // Adds an input layer and an input tensor descriptor. + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); + BOOST_TEST(layer); + + // Adds a splitter layer. + armnn::ViewsDescriptor splitterDesc(2,4); + armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); + BOOST_TEST(splitterLayer); + + layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); + + // Adds a softmax layer 1. + armnn::SoftmaxDescriptor softmaxDescriptor; + armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); + BOOST_TEST(softmax1Layer); + + splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); + + // Adds a softmax layer 2. + armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); + BOOST_TEST(softmax2Layer); + + splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); + + // Adds multiplication layer. + layer = net.AddMultiplicationLayer("multiplication layer"); + BOOST_TEST(layer); + + softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + + // Adds an output layer. 
+ armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddOutputLayer(0, "output layer"); + BOOST_TEST(layer); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateCpuRefWorkloads) +{ + const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); + + armnn::Network net; + + armnn::NormalizationDescriptor nmDesc; + armnn::ActivationDescriptor acDesc; + + // in + // | + // nm + // / | + // ac | + // \ | + // ml + // | + // sm + // | + // ot + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); + + layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).SetTensorInfo(desc); + + layer = net.AddActivationLayer(acDesc, "ac"); + + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddMultiplicationLayer("ml"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + armnn::SoftmaxDescriptor softmaxDescriptor; + layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + layer = net.AddOutputLayer(0, "ot"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuRef }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph().AllocateDynamicBuffers(); + BOOST_CHECK(optNet); + + // Validates workloads. 
+ armnn::RefWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} + +#if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(OptimizeValidateCpuAccDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + // validate workloads + armnn::NeonWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuAcc, layer->GetComputeDevice()); + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} +#endif // ARMCOMPUTENEON_ENABLED + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(OptimizeValidateGpuDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::GpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + // validate workloads + armnn::ClWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::GpuAcc, layer->GetComputeDevice()); + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} +#endif // ARMCOMPUTECL_ENABLED + +BOOST_AUTO_TEST_CASE(OptimizeValidateDeviceNonSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc and isn't allowed to fall back, so Optimize will return null. 
+ armnn::NormalizationDescriptor descriptor; + armnn::IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(!optNet); +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateDeviceNonSupportLayerWithFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc but it is allowed to fall back to CpuRef. + armnn::NormalizationDescriptor descriptor; + armnn::IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc, armnn::Compute::CpuRef }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_REQUIRE(optNet); + + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + // If NEON is enabled, Input and Output layers are supported by CpuAcc, + // the other layers are supported by CpuRef. + // If NEON is not enabled, all layers are supported by CpuRef.
+#if ARMCOMPUTENEON_ENABLED + if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuAcc, layer->GetComputeDevice()); + } + else if (layer->GetType() == armnn::LayerType::Normalization) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +#else + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); +#endif + } +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsUndefinedComputeDevice) +{ + const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); + + armnn::Network net; + + armnn::NormalizationDescriptor nmDesc; + armnn::ActivationDescriptor acDesc; + + // in + // | + // nm + // / | + // ac | + // \ | + // ml + // | + // sm + // | + // ot + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); + + layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).SetTensorInfo(desc); + + layer = net.AddActivationLayer(acDesc, "ac"); + + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddMultiplicationLayer("ml"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + armnn::SoftmaxDescriptor softmaxDescriptor; + layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + layer = net.AddOutputLayer(0, "ot"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::Undefined }; + + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(!optNet); + +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsUndefinedComputeDeviceWithFallback) +{ + const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); + + armnn::Network net; + + armnn::NormalizationDescriptor nmDesc; + armnn::ActivationDescriptor acDesc; + + // in + // | + // nm + // / | + // ac | + // \ | + // ml + // | + // sm + // | + // ot + armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); + + layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).SetTensorInfo(desc); + + layer = net.AddActivationLayer(acDesc, "ac"); + + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + armnn::IConnectableLayer* prevLayer = layer; + layer = net.AddMultiplicationLayer("ml"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = layer; + armnn::SoftmaxDescriptor softmaxDescriptor; + layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + layer->GetOutputSlot(0).SetTensorInfo(desc); + + prevLayer = 
layer; + layer = net.AddOutputLayer(0, "ot"); + + prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::Undefined, armnn::Compute::CpuRef }; + + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + + // validate workloads + armnn::RefWorkloadFactory fact; + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + BOOST_CHECK_NO_THROW( + layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); + } +} +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsDuplicateComputeDeviceWithFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc but it is allowed to fall back to CpuRef. + armnn::NormalizationDescriptor descriptor; + armnn::IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc, + armnn::Compute::GpuAcc, + armnn::Compute::CpuRef }; + + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_REQUIRE(optNet); + + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + // If NEON is enabled, Input and Output layers are supported by CpuAcc, + // the other layers are supported by CpuRef. + // If only CL is enabled, Input and Output layers are supported by GpuAcc, + // the other layers are supported by CpuRef. + // If neither NEON, nor CL is enabled, all layers are supported by CpuRef.
+#if ARMCOMPUTENEON_ENABLED + if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuAcc, layer->GetComputeDevice()); + } + else if (layer->GetType() == armnn::LayerType::Normalization) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +#elif ARMCOMPUTECL_ENABLED + if (layer->GetType() == armnn::LayerType::Input || layer->GetType() == armnn::LayerType::Output) + { + BOOST_CHECK_EQUAL(armnn::Compute::GpuAcc, layer->GetComputeDevice()); + } + else if (layer->GetType() == armnn::LayerType::Normalization) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +#else + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); +#endif + } +} + +BOOST_AUTO_TEST_CASE(OptimizeValidateWorkloadsCpuRefPermuteLayer) +{ + // Create runtime in which test will run + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::PermuteDescriptor descriptor({0, 2, 3, 1}); + armnn::IConnectableLayer* permute = net->AddPermuteLayer(descriptor); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(permute->GetInputSlot(0)); + permute->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + permute->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 4, 1, 4 }, armnn::DataType::Float32)); + + // optimize the network + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + + for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) + { + BOOST_CHECK_EQUAL(armnn::Compute::CpuRef, layer->GetComputeDevice()); + } +} + +BOOST_AUTO_TEST_CASE(FP16TurboModeTestOnCpuRef) +{ + // Test to check when FP16 Turbo mode set + // it converts the FP32 network to FP16 Network + // add FP32ToFP16 conversion layer after the InputLayer + // add FP16ToFP32 conversion layer after the OutputLayer + // checks the other layers if they are supported in FP16 + // if they are not put the conversion layers before and after + // if they are not supported in FP16 use FP32 instead + // if there are inverse conversion layers remove them with optimization + // at the moment FloorLayer is not supported in FP16 so it rolls back to FP32 + // and inverse conversion layers are removed by the optimizer + armnn::Network net; + + // Defines layers. + auto input = net.AddInputLayer(0); + auto floor = net.AddFloorLayer(); + auto output = net.AddOutputLayer(0); + + // Connects layers. 
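+ // The network is simply input -> floor -> output. Since FloorLayer is not supported in FP16,
+ // the optimizer is expected to fall back to FP32 for it and to remove the inverse conversion
+ // layers it inserted, so the serialized graph checked below matches the original FP32 network.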
+ input->GetOutputSlot(0).Connect(floor->GetInputSlot(0)); + floor->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + armnn::TensorShape shape({4}); + armnn::TensorInfo info(shape, armnn::DataType::Float32); + input->GetOutputSlot(0).SetTensorInfo(info); + floor->GetOutputSlot(0).SetTensorInfo(info); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + + armnn::OptimizerOptions optimizerOptions; + optimizerOptions.m_ReduceFp32ToFp16 = true; + + armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec(), + optimizerOptions); + + std::ostringstream ss; + optimizedNet->SerializeToDot(ss); + + auto inputId = input->GetGuid(); + auto floorId = floor->GetGuid(); + auto outputId = output->GetGuid(); + + std::stringstream expected; + expected << + "digraph Optimized {\n" + " node [shape=\"record\"];\n" + " edge [fontsize=8 fontcolor=\"blue\" fontname=\"arial-bold\"];\n" + " " << inputId << " [label=\"{Input}\"];\n" + " " << floorId << " [label=\"{Floor}\"];\n" + " " << outputId << " [label=\"{Output}\"];\n" + " " << inputId << " -> " << floorId << " [label=< [4] >];\n" + " " << floorId << " -> " << outputId << " [label=< [4] >];\n" + "}\n"; + + BOOST_TEST(ss.str() == expected.str()); +} + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(FP16TurboModeTestOnGpuAcc) +{ + // Test to check when Fp16 Turbo mode set + // it converts the Fp32 network to Fp16 Network + // add Fp32ToFp16 conversion layer after the InputLayer + // add Fp16ToFp32 conversion layer after the OutputLayer + // checks the other layers if they are supported in Fp16 + // if they are not put the conversion layers before and after + // if they are not supported in Fp16 use Fp32 instead + // if there are inverse conversion layers remove them with optimization + // at the moment FloorLayer is not supported in Fp16 so it rolls back to Fp32 + // and inverse conversion layers are removed by the optimizer + armnn::Network net; + + // Defines layers. + auto input = net.AddInputLayer(0, "input layer"); + // ReLu1 + armnn::ActivationDescriptor activation1Descriptor; + activation1Descriptor.m_Function = armnn::ActivationFunction::BoundedReLu; + activation1Descriptor.m_A = 1.f; + activation1Descriptor.m_B = -1.f; + auto activation = net.AddActivationLayer(activation1Descriptor, "activation layer"); + auto output = net.AddOutputLayer(0, "output layer"); + + // Connects layers. + input->GetOutputSlot(0).Connect(activation->GetInputSlot(0)); + activation->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + armnn::TensorShape shape({4}); + armnn::TensorInfo info(shape, armnn::DataType::Float32); + input->GetOutputSlot(0).SetTensorInfo(info); + activation->GetOutputSlot(0).SetTensorInfo(info); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = {armnn::Compute::GpuAcc}; + + armnn::OptimizerOptions optimizerOptions; + optimizerOptions.m_ReduceFp32ToFp16 = true; + + armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, backends, runtime->GetDeviceSpec(), + optimizerOptions); + + const armnn::Graph& graph = static_cast<armnn::OptimizedNetwork*>(optimizedNet.get())->GetGraph(); + + // Tests that all layers are present in the graph. + BOOST_TEST(graph.GetNumLayers() == 5); + + // Tests that the vertices exist and have correct names. 
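+ // The conversion layers inserted by the optimizer are named after the layer they are attached to,
+ // in the form convert_fp32_to_fp16-<index>-<adjacent layer name> (and the fp16-to-fp32 equivalent),
+ // which is what the name checks below rely on.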
+ BOOST_TEST(GraphHasNamedLayer(graph, "input layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "convert_fp32_to_fp16-0-input layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "activation layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "convert_fp16_to_fp32-0-output layer")); + BOOST_TEST(GraphHasNamedLayer(graph, "output layer")); +} +#endif + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/Network_test.cpp b/src/armnn/test/Network_test.cpp deleted file mode 100644 index 057caa0505..0000000000 --- a/src/armnn/test/Network_test.cpp +++ /dev/null @@ -1,483 +0,0 @@ -// -// Copyright © 2017 Arm Ltd. All rights reserved. -// See LICENSE file in the project root for full license information. -// -#include <boost/test/unit_test.hpp> - -#include "armnn/ArmNN.hpp" -#include "Network.hpp" -#include "Graph.hpp" -#include "backends/RefWorkloadFactory.hpp" - -#include "GraphUtils.hpp" - -namespace -{ - -bool AreAllLayerInputSlotsConnected(const armnn::IConnectableLayer& layer) -{ - bool allConnected = true; - for (unsigned int i = 0; i < layer.GetNumInputSlots(); ++i) - { - const bool inputConnected = layer.GetInputSlot(i).GetConnection() != nullptr; - allConnected &= inputConnected; - } - return allConnected; -} - -} - -BOOST_AUTO_TEST_SUITE(Network) - -BOOST_AUTO_TEST_CASE(LayerGuids) -{ - armnn::Network net; - armnn::LayerGuid inputId = net.AddInputLayer(0)->GetGuid(); - armnn::LayerGuid addId = net.AddAdditionLayer()->GetGuid(); - armnn::LayerGuid outputId = net.AddOutputLayer(0)->GetGuid(); - - BOOST_TEST(inputId != addId); - BOOST_TEST(addId != outputId); - BOOST_TEST(inputId != outputId); -} - -BOOST_AUTO_TEST_CASE(SerializeToDot) -{ - armnn::Network net; - - //define layers - auto input = net.AddInputLayer(0); - auto add = net.AddAdditionLayer(); - auto output = net.AddOutputLayer(0); - - // connect layers - input->GetOutputSlot(0).Connect(add->GetInputSlot(0)); - input->GetOutputSlot(0).Connect(add->GetInputSlot(1)); - add->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - - armnn::TensorShape shape({4}); - armnn::TensorInfo info(shape, armnn::DataType::Float32); - input->GetOutputSlot(0).SetTensorInfo(info); - add->GetOutputSlot(0).SetTensorInfo(info); - - armnn::DeviceSpec spec; - spec.DefaultComputeDevice = armnn::Compute::CpuAcc; - armnn::IOptimizedNetworkPtr optimizedNet = armnn::Optimize(net, spec); - - std::ostringstream ss; - optimizedNet->SerializeToDot(ss); - - auto inputId = input->GetGuid(); - auto addId = add->GetGuid(); - auto outputId = output->GetGuid(); - - std::stringstream expected; - expected << - "digraph Optimized {\n" - " node [shape=\"record\"];\n" - " edge [fontsize=8 fontcolor=\"blue\" fontname=\"arial-bold\"];\n" - " " << inputId << " [label=\"{Input}\"];\n" - " " << addId << " [label=\"{Addition}\"];\n" - " " << outputId << " [label=\"{Output}\"];\n" - " " << inputId << " -> " << addId << " [label=< [4] >];\n" - " " << inputId << " -> " << addId << " [label=< [4] >];\n" - " " << addId << " -> " << outputId << " [label=< [4] >];\n" - "}\n"; - - BOOST_TEST(ss.str() == expected.str()); -} - -BOOST_AUTO_TEST_CASE(NetworkBasic) -{ - armnn::Network net; - BOOST_TEST(net.PrintGraph() == armnn::Status::Success); -} - -BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForINetwork) -{ - armnn::Network net; - armnn::INetwork& inet = net; - inet.AddInputLayer(0); - inet.AddAdditionLayer(); - inet.AddActivationLayer(armnn::ActivationDescriptor()); - inet.AddOutputLayer(0); -} - -BOOST_AUTO_TEST_CASE(LayerNamesAreOptionalForNetwork) -{ - armnn::Network net; - 
net.AddInputLayer(0); - net.AddAdditionLayer(); - net.AddActivationLayer(armnn::ActivationDescriptor()); - net.AddOutputLayer(0); -} - -BOOST_AUTO_TEST_CASE(NetworkModification) -{ - armnn::Network net; - - armnn::IConnectableLayer* const inputLayer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(inputLayer); - - unsigned int dims[] = { 10,1,1,1 }; - std::vector<float> convWeightsData(10); - armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), convWeightsData); - - armnn::Convolution2dDescriptor convDesc2d; - armnn::IConnectableLayer* const convLayer = net.AddConvolution2dLayer(convDesc2d, weights, "conv layer"); - BOOST_TEST(convLayer); - - inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0)); - - armnn::FullyConnectedDescriptor fullyConnectedDesc; - armnn::IConnectableLayer* const fullyConnectedLayer = net.AddFullyConnectedLayer(fullyConnectedDesc, - weights, - "fully connected"); - BOOST_TEST(fullyConnectedLayer); - - convLayer->GetOutputSlot(0).Connect(fullyConnectedLayer->GetInputSlot(0)); - - armnn::Pooling2dDescriptor pooling2dDesc; - armnn::IConnectableLayer* const poolingLayer = net.AddPooling2dLayer(pooling2dDesc, "pooling2d"); - BOOST_TEST(poolingLayer); - - fullyConnectedLayer->GetOutputSlot(0).Connect(poolingLayer->GetInputSlot(0)); - - armnn::ActivationDescriptor activationDesc; - armnn::IConnectableLayer* const activationLayer = net.AddActivationLayer(activationDesc, "activation"); - BOOST_TEST(activationLayer); - - poolingLayer->GetOutputSlot(0).Connect(activationLayer->GetInputSlot(0)); - - armnn::NormalizationDescriptor normalizationDesc; - armnn::IConnectableLayer* const normalizationLayer = net.AddNormalizationLayer(normalizationDesc, "normalization"); - BOOST_TEST(normalizationLayer); - - activationLayer->GetOutputSlot(0).Connect(normalizationLayer->GetInputSlot(0)); - - armnn::SoftmaxDescriptor softmaxDesc; - armnn::IConnectableLayer* const softmaxLayer = net.AddSoftmaxLayer(softmaxDesc, "softmax"); - BOOST_TEST(softmaxLayer); - - normalizationLayer->GetOutputSlot(0).Connect(softmaxLayer->GetInputSlot(0)); - - armnn::BatchNormalizationDescriptor batchNormDesc; - - armnn::TensorInfo tensorInfo({ 1 }, armnn::DataType::Float32); - std::vector<float> data(tensorInfo.GetNumBytes() / sizeof(float)); - armnn::ConstTensor invalidTensor(tensorInfo, data); - - armnn::IConnectableLayer* const batchNormalizationLayer = net.AddBatchNormalizationLayer(batchNormDesc, - invalidTensor, - invalidTensor, - invalidTensor, - invalidTensor, - "batch norm"); - BOOST_TEST(batchNormalizationLayer); - - softmaxLayer->GetOutputSlot(0).Connect(batchNormalizationLayer->GetInputSlot(0)); - - armnn::IConnectableLayer* const additionLayer = net.AddAdditionLayer("addition"); - BOOST_TEST(additionLayer); - - batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(0)); - batchNormalizationLayer->GetOutputSlot(0).Connect(additionLayer->GetInputSlot(1)); - - armnn::IConnectableLayer* const multiplicationLayer = net.AddMultiplicationLayer("multiplication"); - BOOST_TEST(multiplicationLayer); - - additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(0)); - additionLayer->GetOutputSlot(0).Connect(multiplicationLayer->GetInputSlot(1)); - - armnn::IConnectableLayer* const outputLayer = net.AddOutputLayer(0, "output layer"); - BOOST_TEST(outputLayer); - - multiplicationLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - - //Test that all layers are present in the graph - BOOST_TEST(net.GetGraph().GetNumLayers() 
== 11); - - //Test that the vertices exist and have correct names - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "input layer")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "conv layer")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "fully connected")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "pooling2d")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "activation")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "normalization")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "softmax")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "batch norm")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "addition")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "multiplication")); - BOOST_TEST(GraphHasNamedLayer(net.GetGraph(), "output layer")); - - auto checkOneOutputToOneInputConnection = [] - (const armnn::IConnectableLayer* const srcLayer, - const armnn::IConnectableLayer* const tgtLayer, - int expectedSrcNumInputs = 1, - int expectedDstNumOutputs = 1) - { - BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); - BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); - BOOST_TEST(tgtLayer->GetNumInputSlots() == 1); - BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); - - BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 1); - BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(0) == &tgtLayer->GetInputSlot(0)); - BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(0).GetConnection()); - }; - auto checkOneOutputToTwoInputsConnections = [] - (const armnn::IConnectableLayer* const srcLayer, - const armnn::IConnectableLayer* const tgtLayer, - int expectedSrcNumInputs, - int expectedDstNumOutputs = 1) - { - BOOST_TEST(srcLayer->GetNumInputSlots() == expectedSrcNumInputs); - BOOST_TEST(srcLayer->GetNumOutputSlots() == 1); - BOOST_TEST(tgtLayer->GetNumInputSlots() == 2); - BOOST_TEST(tgtLayer->GetNumOutputSlots() == expectedDstNumOutputs); - - BOOST_TEST(srcLayer->GetOutputSlot(0).GetNumConnections() == 2); - for (unsigned int i = 0; i < srcLayer->GetOutputSlot(0).GetNumConnections(); ++i) - { - BOOST_TEST(srcLayer->GetOutputSlot(0).GetConnection(i) == &tgtLayer->GetInputSlot(i)); - BOOST_TEST(&srcLayer->GetOutputSlot(0) == tgtLayer->GetInputSlot(i).GetConnection()); - } - }; - - BOOST_TEST(AreAllLayerInputSlotsConnected(*convLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*fullyConnectedLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*poolingLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*activationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*normalizationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*softmaxLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*batchNormalizationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*additionLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*multiplicationLayer)); - BOOST_TEST(AreAllLayerInputSlotsConnected(*outputLayer)); - - // Check connectivity - checkOneOutputToOneInputConnection(inputLayer, convLayer, 0); - checkOneOutputToOneInputConnection(convLayer, fullyConnectedLayer); - checkOneOutputToOneInputConnection(fullyConnectedLayer, poolingLayer); - checkOneOutputToOneInputConnection(poolingLayer, activationLayer); - checkOneOutputToOneInputConnection(activationLayer, normalizationLayer); - checkOneOutputToOneInputConnection(normalizationLayer, softmaxLayer); - checkOneOutputToOneInputConnection(softmaxLayer, batchNormalizationLayer); - checkOneOutputToTwoInputsConnections(batchNormalizationLayer, additionLayer, 1); - 
checkOneOutputToTwoInputsConnections(additionLayer, multiplicationLayer, 2); - checkOneOutputToOneInputConnection(multiplicationLayer, outputLayer, 2, 0); -} - -BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMerger) -{ - armnn::Network net; - - // Add an input layer and an input tensor descriptor. - armnn::IConnectableLayer* inputLayer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(inputLayer); - - // Add a splitter layer - armnn::ViewsDescriptor splitterDesc(2,4); - - armnn::IConnectableLayer* splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); - BOOST_TEST(splitterLayer); - - inputLayer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); - - // Add a softmax layer 1 - armnn::SoftmaxDescriptor softmaxDescriptor; - armnn::IConnectableLayer* softmaxLayer1 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); - BOOST_TEST(softmaxLayer1); - - splitterLayer->GetOutputSlot(0).Connect(softmaxLayer1->GetInputSlot(0)); - - // Add a softmax layer 2 - armnn::IConnectableLayer* softmaxLayer2 = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); - BOOST_TEST(softmaxLayer2); - - splitterLayer->GetOutputSlot(1).Connect(softmaxLayer2->GetInputSlot(0)); - - // Add a merger layer - armnn::OriginsDescriptor mergerDesc(2, 4); - - armnn::IConnectableLayer* mergerLayer = net.AddMergerLayer(mergerDesc, "merger layer"); - BOOST_TEST(mergerLayer); - - softmaxLayer1->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(0)); - softmaxLayer2->GetOutputSlot(0).Connect(mergerLayer->GetInputSlot(1)); - - // Add an output layer - armnn::IConnectableLayer* outputLayer = net.AddOutputLayer(0, "output layer"); - BOOST_TEST(outputLayer); - - mergerLayer->GetOutputSlot(0).Connect(outputLayer->GetInputSlot(0)); - - BOOST_TEST(splitterLayer->GetNumOutputSlots() == 2); - BOOST_TEST(splitterLayer->GetOutputSlot(0).GetConnection(0) == &softmaxLayer1->GetInputSlot(0)); - BOOST_TEST(&splitterLayer->GetOutputSlot(0) == softmaxLayer1->GetInputSlot(0).GetConnection()); - BOOST_TEST(splitterLayer->GetOutputSlot(1).GetConnection(0) == &softmaxLayer2->GetInputSlot(0)); - BOOST_TEST(&splitterLayer->GetOutputSlot(1) == softmaxLayer2->GetInputSlot(0).GetConnection()); - - BOOST_TEST(mergerLayer->GetNumInputSlots() == 2); - BOOST_TEST(softmaxLayer1->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(0)); - BOOST_TEST(&softmaxLayer1->GetOutputSlot(0) == mergerLayer->GetInputSlot(0).GetConnection()); - BOOST_TEST(softmaxLayer2->GetOutputSlot(0).GetConnection(0) == &mergerLayer->GetInputSlot(1)); - BOOST_TEST(&softmaxLayer2->GetOutputSlot(0) == mergerLayer->GetInputSlot(1).GetConnection()); -} - -BOOST_AUTO_TEST_CASE(NetworkModification_SplitterAddition) -{ - armnn::Network net; - - // Add an input layer and an input tensor descriptor. 
- armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(layer); - - // Add a splitter layer - armnn::ViewsDescriptor splitterDesc(2,4); - - armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); - BOOST_TEST(splitterLayer); - - layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); - - // Add a softmax layer 1 - armnn::SoftmaxDescriptor softmaxDescriptor; - armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); - BOOST_TEST(softmax1Layer); - - splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); - - // Add a softmax layer 2 - armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); - BOOST_TEST(softmax2Layer); - - splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); - - // Add addition layer - layer = net.AddAdditionLayer("add layer"); - BOOST_TEST(layer); - - softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); - - // Add an output layer - armnn::IConnectableLayer* prevLayer = layer; - layer = net.AddOutputLayer(0, "output layer"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - - BOOST_TEST(layer); -} - -BOOST_AUTO_TEST_CASE(NetworkModification_SplitterMultiplication) -{ - armnn::Network net; - - // Add an input layer and an input tensor descriptor. - armnn::IConnectableLayer* layer = net.AddInputLayer(0, "input layer"); - BOOST_TEST(layer); - - // Add a splitter layer - armnn::ViewsDescriptor splitterDesc(2,4); - armnn::IConnectableLayer* const splitterLayer = net.AddSplitterLayer(splitterDesc, "splitter layer"); - BOOST_TEST(splitterLayer); - - layer->GetOutputSlot(0).Connect(splitterLayer->GetInputSlot(0)); - - // Add a softmax layer 1 - armnn::SoftmaxDescriptor softmaxDescriptor; - armnn::IConnectableLayer* const softmax1Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_1"); - BOOST_TEST(softmax1Layer); - - splitterLayer->GetOutputSlot(0).Connect(softmax1Layer->GetInputSlot(0)); - - // Add a softmax layer 2 - armnn::IConnectableLayer* const softmax2Layer = net.AddSoftmaxLayer(softmaxDescriptor, "softmax_2"); - BOOST_TEST(softmax2Layer); - - splitterLayer->GetOutputSlot(1).Connect(softmax2Layer->GetInputSlot(0)); - - // Add multiplication layer - layer = net.AddMultiplicationLayer("multiplication layer"); - BOOST_TEST(layer); - - softmax1Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - softmax2Layer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); - - // Add an output layer - armnn::IConnectableLayer* prevLayer = layer; - layer = net.AddOutputLayer(0, "output layer"); - BOOST_TEST(layer); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); -} - -BOOST_AUTO_TEST_CASE(ValidateWorkloads) -{ - const armnn::TensorInfo desc({3, 5}, armnn::DataType::Float32); - - armnn::Network net; - - armnn::NormalizationDescriptor nmDesc; - armnn::ActivationDescriptor acDesc; - - // in - // | - // nm - // / | - // ac | - // \ | - // ml - // | - // sm - // | - // ot - armnn::IConnectableLayer* layer = net.AddInputLayer(0, "in"); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - armnn::IConnectableLayer* const normLayer = net.AddNormalizationLayer(nmDesc, "nm"); - - layer->GetOutputSlot(0).Connect(normLayer->GetInputSlot(0)); - normLayer->GetOutputSlot(0).SetTensorInfo(desc); - - layer = net.AddActivationLayer(acDesc, "ac"); - - 
normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - armnn::IConnectableLayer* prevLayer = layer; - layer = net.AddMultiplicationLayer("ml"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - normLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(1)); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - prevLayer = layer; - armnn::SoftmaxDescriptor softmaxDescriptor; - layer = net.AddSoftmaxLayer(softmaxDescriptor, "sm"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - layer->GetOutputSlot(0).SetTensorInfo(desc); - - prevLayer = layer; - layer = net.AddOutputLayer(0, "ot"); - - prevLayer->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); - - armnn::DeviceSpec spec; - spec.DefaultComputeDevice = armnn::Compute::CpuRef; - - armnn::IOptimizedNetworkPtr optNet = Optimize(net, spec); - static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph().AllocateDynamicBuffers(); - - // validate workloads - armnn::RefWorkloadFactory fact; - for (auto&& layer : static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph()) - { - BOOST_CHECK_NO_THROW( - layer->CreateWorkload(static_cast<armnn::OptimizedNetwork*>(optNet.get())->GetGraph(), fact)); - } -} - -BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/ObservableTest.cpp b/src/armnn/test/ObservableTest.cpp new file mode 100644 index 0000000000..6588f3469e --- /dev/null +++ b/src/armnn/test/ObservableTest.cpp @@ -0,0 +1,94 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include <boost/test/unit_test.hpp> + +#include "Graph.hpp" +#include "Observable.hpp" + +BOOST_AUTO_TEST_SUITE(Observable) + +BOOST_AUTO_TEST_CASE(AddedLayerObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::AddedLayerObservable layerObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + auto input = graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + // Check the observable has observed the changes + std::list<armnn::Layer*> testLayers({ output, input }); + + BOOST_CHECK_EQUAL_COLLECTIONS(layerObservable.begin(), layerObservable.end(), + testLayers.begin(), testLayers.end()); +} + +BOOST_AUTO_TEST_CASE(ClearAddedLayerObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::AddedLayerObservable addedLayerObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + addedLayerObservable.Clear(); + + // Check the observable has observed the changes + std::list<armnn::Layer*> emptyList({}); + + BOOST_CHECK_EQUAL_COLLECTIONS(addedLayerObservable.begin(), addedLayerObservable.end(), + emptyList.begin(), emptyList.end()); +} + +BOOST_AUTO_TEST_CASE(ErasedLayerNamesObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::ErasedLayerNamesObservable erasedLayerNamesObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + graph.EraseLayer(output); + + // Check the observable has observed the changes + std::list<std::string> testList({"output"}); + + BOOST_CHECK_EQUAL_COLLECTIONS(erasedLayerNamesObservable.begin(), erasedLayerNamesObservable.end(), + testList.begin(), 
testList.end()); +} + +BOOST_AUTO_TEST_CASE(ClearErasedLayerNamesObservableTest) +{ + armnn::Graph graph; + + // Create a graph observable + armnn::ErasedLayerNamesObservable erasedLayerNamesObservable(graph); + + // Add a few layers + auto output = graph.AddLayer<armnn::OutputLayer>(0, "output"); + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + graph.EraseLayer(output); + + erasedLayerNamesObservable.Clear(); + + // Check the observable has observed the changes + std::list<std::string> emptyList({}); + + BOOST_CHECK_EQUAL_COLLECTIONS(erasedLayerNamesObservable.begin(), erasedLayerNamesObservable.end(), + emptyList.begin(), emptyList.end()); +} + +BOOST_AUTO_TEST_SUITE_END() + diff --git a/src/armnn/test/OpenClTimerTest.cpp b/src/armnn/test/OpenClTimerTest.cpp new file mode 100644 index 0000000000..b8dea8ebe0 --- /dev/null +++ b/src/armnn/test/OpenClTimerTest.cpp @@ -0,0 +1,137 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#if (defined(__aarch64__)) || (defined(__x86_64__)) // disable test failing on FireFly/Armv7 + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "backends/ClContextControl.hpp" +#include "backends/ClWorkloadFactory.hpp" +#include "backends/CpuTensorHandle.hpp" +#include <boost/format.hpp> +#include <iostream> +#include "OpenClTimer.hpp" +#include "backends/test/TensorCopyUtils.hpp" +#include "TensorHelpers.hpp" +#include <boost/test/unit_test.hpp> +#include "backends/WorkloadFactory.hpp" +#include "backends/test/WorkloadTestUtils.hpp" + +using namespace armnn; + +struct OpenClFixture +{ + // Initialising ClContextControl to ensure OpenCL is loaded correctly for each test case. + // NOTE: Profiling needs to be enabled in ClContextControl to be able to obtain execution + // times from OpenClTimer. + OpenClFixture() : m_ClContextControl(nullptr, true) {} + ~OpenClFixture() {} + + ClContextControl m_ClContextControl; +}; + +BOOST_FIXTURE_TEST_SUITE(OpenClTimerBatchNorm, OpenClFixture) +using FactoryType = ClWorkloadFactory; + +BOOST_AUTO_TEST_CASE(OpenClTimerBatchNorm) +{ + ClWorkloadFactory workloadFactory; + + const unsigned int width = 2; + const unsigned int height = 3; + const unsigned int channels = 2; + const unsigned int num = 1; + int32_t qOffset = 0; + float qScale = 0.f; + + TensorInfo inputTensorInfo({num, channels, height, width}, GetDataType<float>()); + TensorInfo outputTensorInfo({num, channels, height, width}, GetDataType<float>()); + TensorInfo tensorInfo({channels}, GetDataType<float>()); + + // Set quantization parameters if the requested type is a quantized type. 
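+ // (IsQuantizedType<float>() is false, so for this float test the block below has no effect.)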
+ if(IsQuantizedType<float>()) + { + inputTensorInfo.SetQuantizationScale(qScale); + inputTensorInfo.SetQuantizationOffset(qOffset); + outputTensorInfo.SetQuantizationScale(qScale); + outputTensorInfo.SetQuantizationOffset(qOffset); + tensorInfo.SetQuantizationScale(qScale); + tensorInfo.SetQuantizationOffset(qOffset); + } + + auto input = MakeTensor<float, 4>(inputTensorInfo, + QuantizedVector<float>(qScale, qOffset, + { + 1.f, 4.f, + 4.f, 2.f, + 1.f, 6.f, + + 1.f, 1.f, + 4.f, 1.f, + -2.f, 4.f + })); + // these values are per-channel of the input + auto mean = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, -2})); + auto variance = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {4, 9})); + auto beta = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {3, 2})); + auto gamma = MakeTensor<float, 1>(tensorInfo, QuantizedVector<float>(qScale, qOffset, {2, 1})); + + std::unique_ptr<ITensorHandle> inputHandle = workloadFactory.CreateTensorHandle(inputTensorInfo); + std::unique_ptr<ITensorHandle> outputHandle = workloadFactory.CreateTensorHandle(outputTensorInfo); + + BatchNormalizationQueueDescriptor data; + WorkloadInfo info; + ScopedCpuTensorHandle meanTensor(tensorInfo); + ScopedCpuTensorHandle varianceTensor(tensorInfo); + ScopedCpuTensorHandle betaTensor(tensorInfo); + ScopedCpuTensorHandle gammaTensor(tensorInfo); + + AllocateAndCopyDataToITensorHandle(&meanTensor, &mean[0]); + AllocateAndCopyDataToITensorHandle(&varianceTensor, &variance[0]); + AllocateAndCopyDataToITensorHandle(&betaTensor, &beta[0]); + AllocateAndCopyDataToITensorHandle(&gammaTensor, &gamma[0]); + + AddInputToWorkload(data, info, inputTensorInfo, inputHandle.get()); + AddOutputToWorkload(data, info, outputTensorInfo, outputHandle.get()); + data.m_Mean = &meanTensor; + data.m_Variance = &varianceTensor; + data.m_Beta = &betaTensor; + data.m_Gamma = &gammaTensor; + data.m_Parameters.m_Eps = 0.0f; + + // for each channel: + // substract mean, divide by standard deviation (with an epsilon to avoid div by 0) + // multiply by gamma and add beta + std::unique_ptr<IWorkload> workload = workloadFactory.CreateBatchNormalization(data, info); + + inputHandle->Allocate(); + outputHandle->Allocate(); + + CopyDataToITensorHandle(inputHandle.get(), &input[0][0][0][0]); + + OpenClTimer openClTimer; + + BOOST_CHECK_EQUAL(openClTimer.GetName(), "OpenClKernelTimer"); + + //Start the timer + openClTimer.Start(); + + //Execute the workload + workload->Execute(); + + //Stop the timer + openClTimer.Stop(); + + BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().size(), 1); + + BOOST_CHECK_EQUAL(openClTimer.GetMeasurements().front().m_Name, + "OpenClKernelTimer/0: batchnormalization_layer_nchw GWS[1,3,2]"); + + BOOST_CHECK(openClTimer.GetMeasurements().front().m_Value > 0); + +} + +BOOST_AUTO_TEST_SUITE_END() + +#endif //aarch64 or x86_64
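For reference, the per-channel computation performed by the timed batch normalization workload (subtract the mean, divide by the standard deviation with an epsilon guard, then scale by gamma and add beta, as noted in the comment above) can be sketched as follows; the helper is illustrative only and is not part of the Arm NN API:

#include <cmath>

// Reference result for a single element of a single channel; eps corresponds to
// data.m_Parameters.m_Eps in the workload descriptor above.
float BatchNormReference(float x, float mean, float variance, float gamma, float beta, float eps)
{
    return gamma * ((x - mean) / std::sqrt(variance + eps)) + beta;
}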
\ No newline at end of file diff --git a/src/armnn/test/OptimizerTests.cpp b/src/armnn/test/OptimizerTests.cpp index da26fba76e..0c1a2619b2 100644 --- a/src/armnn/test/OptimizerTests.cpp +++ b/src/armnn/test/OptimizerTests.cpp @@ -7,6 +7,8 @@ #include "armnn/ArmNN.hpp" #include "Graph.hpp" #include "Optimizer.hpp" +#include "backends/CpuTensorHandle.hpp" +#include "FloatingPointConverter.hpp" namespace { @@ -21,7 +23,7 @@ bool CheckSequence(const armnn::Graph::ConstIterator first, const armnn::Graph:: return (first == last); } -/// Check each unary function in Us evaluates true for each correspondent layer in the sequence [first, last) +/// Checks each unary function in Us evaluates true for each correspondent layer in the sequence [first, last). template <typename U, typename... Us> bool CheckSequence(const armnn::Graph::ConstIterator first, const armnn::Graph::ConstIterator last, @@ -30,11 +32,149 @@ bool CheckSequence(const armnn::Graph::ConstIterator first, { return u(*first) && CheckSequence(std::next(first), last, us...); } + +template <typename LayerT> +bool CheckRelatedLayers(armnn::Graph& graph, const std::list<std::string>& testRelatedLayers) +{ + for (auto& layer : graph) + { + if (layer->GetType() == armnn::LayerEnumOf<LayerT>()) + { + auto& relatedLayers = layer->GetRelatedLayerNames(); + if(!std::equal(relatedLayers.begin(), relatedLayers.end(), + testRelatedLayers.begin(), testRelatedLayers.end())) + { + return false; + } + } + } + + return true; +} + +// connects two layers +using namespace armnn; +void Connect(Layer* from, Layer* to, const TensorInfo& tensorInfo, unsigned int fromIndex = 0, unsigned int toIndex = 0) +{ + from->GetOutputSlot(fromIndex).Connect(to->GetInputSlot(toIndex)); + from->GetOutputHandler(fromIndex).SetTensorInfo(tensorInfo); +} + +void CreateLSTMLayerHelper(Graph &graph, bool CifgEnabled) +{ + LstmDescriptor layerDesc; + layerDesc.m_ActivationFunc = 4; + layerDesc.m_ClippingThresCell = 0.2f; + layerDesc.m_ClippingThresProj = 0.4f; + layerDesc.m_CifgEnabled = CifgEnabled; + layerDesc.m_PeepholeEnabled = false; + layerDesc.m_ProjectionEnabled = false; + + LstmLayer* const layer = graph.AddLayer<LstmLayer>(layerDesc, "layer"); + unsigned int batchSize = 3; + unsigned int inputSize = 2; + unsigned int numUnits = 4; + unsigned int outputSize = 4; + + layer->m_BasicParameters.m_InputToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_InputToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToCellWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_RecurrentToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_BasicParameters.m_ForgetGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_BasicParameters.m_CellBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + 
layer->m_BasicParameters.m_OutputGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + + layer->m_BasicParameters.m_InputToForgetWeights->Allocate(); + layer->m_BasicParameters.m_InputToCellWeights->Allocate(); + layer->m_BasicParameters.m_InputToOutputWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToForgetWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToCellWeights->Allocate(); + layer->m_BasicParameters.m_RecurrentToOutputWeights->Allocate(); + layer->m_BasicParameters.m_ForgetGateBias->Allocate(); + layer->m_BasicParameters.m_CellBias->Allocate(); + layer->m_BasicParameters.m_OutputGateBias->Allocate(); + + if (!layerDesc.m_CifgEnabled) + { + layer->m_CifgParameters.m_InputToInputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, inputSize }, DataType::Float32)); + layer->m_CifgParameters.m_RecurrentToInputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits, outputSize }, DataType::Float32)); + layer->m_CifgParameters.m_CellToInputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_CifgParameters.m_InputGateBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_CifgParameters.m_InputToInputWeights->Allocate(); + layer->m_CifgParameters.m_RecurrentToInputWeights->Allocate(); + layer->m_CifgParameters.m_CellToInputWeights->Allocate(); + layer->m_CifgParameters.m_InputGateBias->Allocate(); + } + + if (layerDesc.m_ProjectionEnabled) + { + layer->m_ProjectionParameters.m_ProjectionWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ outputSize, numUnits }, DataType::Float32)); + layer->m_ProjectionParameters.m_ProjectionBias = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ outputSize }, DataType::Float32)); + layer->m_ProjectionParameters.m_ProjectionWeights->Allocate(); + layer->m_ProjectionParameters.m_ProjectionBias->Allocate(); + } + + if (layerDesc.m_PeepholeEnabled) + { + layer->m_PeepholeParameters.m_CellToForgetWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToOutputWeights = std::make_unique<ScopedCpuTensorHandle> + (TensorInfo({ numUnits }, DataType::Float32)); + layer->m_PeepholeParameters.m_CellToForgetWeights->Allocate(); + layer->m_PeepholeParameters.m_CellToOutputWeights->Allocate(); + } + + // create input and output layers + Layer* const input = graph.AddLayer<InputLayer>(0, "input"); + Layer* const outputStateIn = graph.AddLayer<InputLayer>(1, "outputStateIn"); + Layer* const cellStateIn = graph.AddLayer<InputLayer>(2, "cellStateIn"); + Layer* const scratchBuffer = graph.AddLayer<OutputLayer>(0, "scratchBuffer"); + Layer* const outputStateOut = graph.AddLayer<OutputLayer>(1, "outputStateOut"); + Layer* const cellStateOut = graph.AddLayer<OutputLayer>(2, "cellStateOut"); + Layer* const output = graph.AddLayer<OutputLayer>(3, "output"); + + // connect up + armnn::TensorInfo lstmTensorInfo1({ batchSize, inputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfo2({ batchSize, numUnits}, DataType::Float32); + armnn::TensorInfo lstmTensorInfo3({ batchSize, outputSize }, DataType::Float32); + armnn::TensorInfo lstmTensorInfoScratchBuff({ batchSize, numUnits*3 }, DataType::Float32); + if (layerDesc.m_CifgEnabled) + { + lstmTensorInfoScratchBuff.SetShape({ batchSize, numUnits*4 }); + } + + Connect(input, layer, lstmTensorInfo1, 0, 0); + 
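+ // Together with the connection above, these Connect() calls wire the LSTM slots as used here:
+ // input slots 0..2 take the input, the cell state and the output state, while output slots
+ // 0..3 expose the scratch buffer, the output state, the cell state and the final output.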
Connect(cellStateIn, layer, lstmTensorInfo2, 0, 1); + Connect(outputStateIn, layer, lstmTensorInfo3, 0, 2); + Connect(layer, scratchBuffer, lstmTensorInfoScratchBuff, 0, 0); + Connect(layer, outputStateOut, lstmTensorInfo3, 1, 0); + Connect(layer, cellStateOut, lstmTensorInfo2, 2, 0); + Connect(layer, output, lstmTensorInfo3, 3, 0); +} + } BOOST_AUTO_TEST_SUITE(Optimizer) +using namespace armnn::optimizations; -BOOST_AUTO_TEST_CASE(OptimizeInversePermutes) +BOOST_AUTO_TEST_CASE(OptimizeInversePermutesTest) { armnn::Graph graph; @@ -42,7 +182,7 @@ BOOST_AUTO_TEST_CASE(OptimizeInversePermutes) graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); - // Insert two permutes, one the inverse of the other + // Inserts two permutes, one the inverse of the other. graph.InsertNewLayer<armnn::PermuteLayer>(output->GetInputSlot(0), armnn::PermuteDescriptor({0, 2, 3, 1}), "perm0231"); @@ -57,16 +197,38 @@ BOOST_AUTO_TEST_CASE(OptimizeInversePermutes) &IsLayerOfType<armnn::PermuteLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeInversePermutes())); - // The permutes are removed + // The permutes are removed. BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, &IsLayerOfType<armnn::OutputLayer>)); } -BOOST_AUTO_TEST_CASE(MovePermuteUp) +BOOST_AUTO_TEST_CASE(LSTMValidateTensorShapesFromInputsCIFGDisabledTest) +{ + Graph graph; + + //Helper function creates graph containing LSTM layer with required input and output layers + CreateLSTMLayerHelper(graph, false); + + //This function used to call ValidateShapesFromInputs(); + BOOST_CHECK_NO_THROW(graph.InferTensorInfos()); +} + +BOOST_AUTO_TEST_CASE(LSTMValidateTensorShapesFromInputsCIFGEnabledTest) +{ + Graph graph; + + //Helper function creates graph containing LSTM layer with required input and output layers + CreateLSTMLayerHelper(graph, true); + + //This function used to call ValidateShapesFromInputs(); + BOOST_CHECK_NO_THROW(graph.InferTensorInfos()); +} + +BOOST_AUTO_TEST_CASE(MovePermuteUpTest) { const armnn::TensorInfo info({ 1, 5, 2, 3 }, armnn::DataType::Float32); const armnn::TensorInfo permuted({ 1, 3, 5, 2 }, armnn::DataType::Float32); @@ -77,12 +239,16 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) armnn::Layer* head = graph.AddLayer<armnn::OutputLayer>(0, "output"); + std::string permuteLayerName = "original_permute"; + // Insert permute head = graph.InsertNewLayer<armnn::PermuteLayer>(head->GetInputSlot(0), - armnn::PermuteDescriptor({ 0, 2, 3, 1 }), ""); + armnn::PermuteDescriptor({ 0, 2, 3, 1 }), + permuteLayerName.c_str()); + head->GetOutputHandler().SetTensorInfo(permuted); - // Insert layers that don't care about data format + // Inserts layers that don't care about data format. head = graph.InsertNewLayer<armnn::ActivationLayer>(head->GetInputSlot(0), armnn::ActivationDescriptor{}, ""); head->GetOutputHandler().SetTensorInfo(info); @@ -90,7 +256,7 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) head = graph.InsertNewLayer<armnn::AdditionLayer>(head->GetInputSlot(0), ""); head->GetOutputHandler().SetTensorInfo(info); - // Insert input for 2nd input of Addition + // Inserts input for 2nd input of Addition. 
graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "") ->GetOutputHandler().SetTensorInfo(info); @@ -107,11 +273,11 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) head = graph.InsertNewLayer<armnn::MultiplicationLayer>(head->GetInputSlot(0), ""); head->GetOutputHandler().SetTensorInfo(info); - // Insert input for 2nd input of Multiplication + // Inserts input for 2nd input of Multiplication. graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "") ->GetOutputHandler().SetTensorInfo(info); - // Insert input + // Inserts input. graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(0), inputId++, "") ->GetOutputHandler().SetTensorInfo(info); @@ -129,9 +295,9 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) &IsLayerOfType<armnn::PermuteLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(MovePermuteUp())); - // The permute is moved to the top. New permutes for layers with multiple inputs + // The permute is moved to the top. New permutes for layers with multiple inputs. BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, @@ -147,12 +313,18 @@ BOOST_AUTO_TEST_CASE(MovePermuteUp) &IsLayerOfType<armnn::AdditionLayer>, &IsLayerOfType<armnn::ActivationLayer>, &IsLayerOfType<armnn::OutputLayer>)); + + std::list<std::string> testRelatedLayers = { permuteLayerName }; + + BOOST_TEST(CheckRelatedLayers<armnn::PermuteLayer>(graph, testRelatedLayers)); } -BOOST_AUTO_TEST_CASE(PermuteAsReshape) +BOOST_AUTO_TEST_CASE(PermuteAsReshapeTest) { armnn::Graph graph; + std::string permuteLayerName = "permute"; + const armnn::TensorInfo infoIn({ 1, 2, 3, 1 }, armnn::DataType::Float32); const armnn::TensorInfo infoOut({ 1, 1, 2, 3 }, armnn::DataType::Float32); @@ -161,9 +333,9 @@ BOOST_AUTO_TEST_CASE(PermuteAsReshape) graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input") ->GetOutputHandler().SetTensorInfo(infoIn); - // Insert permute + // Inserts permute. graph.InsertNewLayer<armnn::PermuteLayer>(output->GetInputSlot(0), - armnn::PermuteDescriptor({ 0, 2, 3, 1 }), "") + armnn::PermuteDescriptor({ 0, 2, 3, 1 }), permuteLayerName.c_str()) ->GetOutputHandler().SetTensorInfo(infoOut); BOOST_TEST(CheckSequence(graph.cbegin(), @@ -172,7 +344,7 @@ BOOST_AUTO_TEST_CASE(PermuteAsReshape) &IsLayerOfType<armnn::PermuteLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(PermuteAsReshape())); // The permute is replaced by an equivalent reshape. @@ -189,9 +361,13 @@ BOOST_AUTO_TEST_CASE(PermuteAsReshape) &IsLayerOfType<armnn::InputLayer>, checkReshape, &IsLayerOfType<armnn::OutputLayer>)); + + + std::list<std::string> testRelatedLayers = { permuteLayerName }; + BOOST_TEST(CheckRelatedLayers<armnn::ReshapeLayer>(graph, testRelatedLayers)); } -BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) +BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapesTest) { armnn::Graph graph; @@ -203,16 +379,19 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) input->GetOutputHandler().SetTensorInfo(info0); { - // Insert two reshapes + // Inserts two reshapes. 
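+ // (reshape1 to info1's shape followed by reshape2 to info2's shape; the pass applied below is
+ // expected to fold them into a single reshape to info2's shape and to record both original
+ // layer names as related layers of the replacement.)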
const armnn::TensorInfo info1({1, 30, 1, 1}, armnn::DataType::Float32); const armnn::TensorInfo info2({1, 2, 1, 15}, armnn::DataType::Float32); + std::string reshape1Name = "reshape1"; + std::string reshape2Name = "reshape2"; + auto reshape1 = graph.InsertNewLayer<armnn::ReshapeLayer>(output->GetInputSlot(0), armnn::ReshapeDescriptor{ info1.GetShape() }, - "reshape1"); + reshape1Name.c_str()); auto reshape2 = graph.InsertNewLayer<armnn::ReshapeLayer>(output->GetInputSlot(0), armnn::ReshapeDescriptor{ info2.GetShape() }, - "reshape2"); + reshape2Name.c_str()); reshape1->GetOutputHandler().SetTensorInfo(info1); reshape2->GetOutputHandler().SetTensorInfo(info2); @@ -224,7 +403,7 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) &IsLayerOfType<armnn::ReshapeLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeConsecutiveReshapes())); auto checkReshape = [&info2](const armnn::Layer* const layer) -> bool { @@ -234,25 +413,30 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) (reshapeLayer->GetOutputHandler().GetTensorInfo().GetShape() == info2.GetShape()); }; - // The two reshapes are replaced by a single equivalent reshape + // The two reshapes are replaced by a single equivalent reshape. BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, checkReshape, &IsLayerOfType<armnn::OutputLayer>)); + + // Check the new reshape layer has the other two reshapes as related layers + std::list<std::string> testRelatedLayers = { reshape2Name, reshape1Name }; + + BOOST_TEST(CheckRelatedLayers<armnn::ReshapeLayer>(graph, testRelatedLayers)); } { - // Insert a reshape to the input shape + // Inserts a reshape to the input shape. auto reshapeToIn = graph.InsertNewLayer<armnn::ReshapeLayer>(output->GetInputSlot(0), armnn::ReshapeDescriptor{ info0.GetShape() }, "reshapeToIn"); reshapeToIn->GetOutputHandler().SetTensorInfo(info0); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeConsecutiveReshapes())); - // The two reshapes are removed + // The two reshapes are removed. BOOST_TEST(CheckSequence(graph.cbegin(), graph.cend(), &IsLayerOfType<armnn::InputLayer>, @@ -260,7 +444,7 @@ BOOST_AUTO_TEST_CASE(OptimizeConsecutiveReshapes) } } -BOOST_AUTO_TEST_CASE(SquashEqualSiblings) +BOOST_AUTO_TEST_CASE(SquashEqualSiblingsTest) { armnn::Graph graph; @@ -272,7 +456,7 @@ BOOST_AUTO_TEST_CASE(SquashEqualSiblings) auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); input->GetOutputSlot().SetTensorInfo(info); - // Insert equal permutes, equal reshapes and something else + // Inserts equal permutes, equal reshapes and something else. const armnn::PermuteDescriptor permDesc({ 0, 2, 3, 1 }); const armnn::ReshapeDescriptor reshapeDesc{ { 1, 3, 1, 5 } }; @@ -314,7 +498,8 @@ BOOST_AUTO_TEST_CASE(SquashEqualSiblings) &IsLayerOfType<armnn::OutputLayer>, &IsLayerOfType<armnn::OutputLayer>)); - armnn::Optimizer::Optimize(graph); + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(SquashEqualPermuteSiblings(), + SquashEqualReshapeSiblings())); // The permutes and reshapes are squashed. 
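The updated tests in this file drive individual graph optimizations through Optimizer::Pass() with an explicit list of passes rather than a single catch-all optimize call. A minimal sketch of that usage, assuming the same headers as this file and only the pass names already used in these tests:

using namespace armnn::optimizations;

armnn::Graph graph;
// ... build up the graph under test ...

// Compose the desired passes and apply them to the graph in one call.
armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeInversePermutes(),
                                                        OptimizeConsecutiveReshapes(),
                                                        SquashEqualPermuteSiblings(),
                                                        SquashEqualReshapeSiblings()));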
@@ -331,4 +516,259 @@ BOOST_AUTO_TEST_CASE(SquashEqualSiblings) &IsLayerOfType<armnn::OutputLayer>)); } +BOOST_AUTO_TEST_CASE(ConvertConstantsHalfToFloatTest) +{ + armnn::Graph graph; + + const armnn::TensorInfo info({ 1,1,1,2 }, armnn::DataType::Float32); + + // Create the half precision input data + unsigned int dims[] = { 4,1,1,1 }; + std::vector<float> convWeightsData{1.f, 2.f, 3.f, 4.f}; + std::vector<uint16_t> halfWeights(4); + armnnUtils::FloatingPointConverter::ConvertFloat32To16(convWeightsData.data(), + convWeightsData.size(), + halfWeights.data()); + armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float16), halfWeights); + + //Create the simple test network + auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); + input->GetOutputSlot().SetTensorInfo(info); + + auto fc = graph.AddLayer<armnn::FullyConnectedLayer>(armnn::FullyConnectedDescriptor(), "fc"); + fc->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(weights); + fc->GetOutputSlot().SetTensorInfo(info); + + auto output = graph.AddLayer<armnn::OutputLayer>(1, "output"); + + //Connect up the layers + input->GetOutputSlot().Connect(fc->GetInputSlot(0)); + fc->GetOutputSlot().Connect(output->GetInputSlot(0)); + + //Test the tensor info is correct. + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float16); + + // Run the optimizer + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(ConvertConstantsHalfToFloat())); + + //Test the tensor info is correct. + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float32); + + // Now test the data matches float32 data + float* data = fc->m_Weight->GetTensor<float>(); + BOOST_CHECK(1.0f == data[0]); + BOOST_CHECK(2.0f == data[1]); + BOOST_CHECK(3.0f == data[2]); + BOOST_CHECK(4.0f == data[3]); +} + +BOOST_AUTO_TEST_CASE(ConvertConstantsFloatToHalfTest) +{ + armnn::Graph graph; + + const armnn::TensorInfo info({ 1, 1, 1, 2 }, armnn::DataType::Float16); + + // Create const tensor from fp32 data + unsigned int dims[] = { 4, 1, 1, 1 }; + std::vector<float> floatWeights{ 1.0f, 2.0f, 3.0f, 4.0f }; + armnn::ConstTensor weights(armnn::TensorInfo(4, dims, armnn::DataType::Float32), floatWeights); + + // Create simple test network + auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); + input->GetOutputSlot().SetTensorInfo(info); + + auto fc = graph.AddLayer<armnn::FullyConnectedLayer>(armnn::FullyConnectedDescriptor(), "fc"); + fc->m_Weight = std::make_unique<armnn::ScopedCpuTensorHandle>(weights); + fc->GetOutputSlot().SetTensorInfo(info); + + auto output = graph.AddLayer<armnn::OutputLayer>(1, "output"); + + // Connect up the layers + input->GetOutputSlot().Connect(fc->GetInputSlot(0)); + fc->GetOutputSlot().Connect(output->GetInputSlot(0)); + + // Check tensor data type before conversion + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float32); + + // Run the optimizer + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(ConvertConstantsFloatToHalf())); + + // Check tensor data type after conversion + BOOST_CHECK(fc->m_Weight->GetTensorInfo().GetDataType() == armnn::DataType::Float16); + + // Check whether data matches expected fp16 data + Half* data = fc->m_Weight->GetTensor<Half>(); + BOOST_CHECK(data[0] == Half(1.0f)); + BOOST_CHECK(data[1] == Half(2.0f)); + BOOST_CHECK(data[2] == Half(3.0f)); + BOOST_CHECK(data[3] == Half(4.0f)); +} + +BOOST_AUTO_TEST_CASE(OptimizeInverseConversionsTest) +{ + armnn::Graph graph; + + auto output = 
graph.AddLayer<armnn::OutputLayer>(0, "output"); + + graph.InsertNewLayer<armnn::InputLayer>(output->GetInputSlot(0), 0, "input"); + + // Fp32ToFp16 conversion followed by an inverse Fp16ToFp32 conversion + graph.InsertNewLayer<armnn::ConvertFp32ToFp16Layer>(output->GetInputSlot(0), "convert1"); + graph.InsertNewLayer<armnn::ConvertFp16ToFp32Layer>(output->GetInputSlot(0), "convert2"); + + graph.InsertNewLayer<armnn::Convolution2dLayer>(output->GetInputSlot(0), Convolution2dDescriptor(), "conv"); + + // Fp16ToFp32 conversion followed by an inverse Fp32ToFp16 conversion + graph.InsertNewLayer<armnn::ConvertFp16ToFp32Layer>(output->GetInputSlot(0), "convert3"); + graph.InsertNewLayer<armnn::ConvertFp32ToFp16Layer>(output->GetInputSlot(0), "convert4"); + + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::Convolution2dLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::OutputLayer>)); + + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(OptimizeInverseConversionsFp16(), + OptimizeInverseConversionsFp32())); + + // Check that all consecutive inverse conversions are removed + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::Convolution2dLayer>, + &IsLayerOfType<armnn::OutputLayer>)); +} + +BOOST_AUTO_TEST_CASE(InsertConvertersTest) +{ + const armnn::TensorInfo info({ 1, 5, 2, 3 }, armnn::DataType::Float16); + + armnn::Graph graph; + + armnn::LayerBindingId inputId = 0; + + armnn::Layer* head = graph.AddLayer<armnn::OutputLayer>(0, "output"); + + head = graph.InsertNewLayer<armnn::AdditionLayer>(head->GetInputSlot(0), ""); + head->GetOutputHandler().SetTensorInfo(info); + + graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(1), inputId++, "") + ->GetOutputHandler().SetTensorInfo(info); + + head = graph.InsertNewLayer<armnn::FloorLayer>(head->GetInputSlot(0), ""); + head->GetOutputHandler().SetTensorInfo(info); + + head = graph.InsertNewLayer<armnn::MemCopyLayer>(head->GetInputSlot(0), ""); + head->GetOutputHandler().SetTensorInfo(info); + + graph.InsertNewLayer<armnn::InputLayer>(head->GetInputSlot(0), inputId++, "") + ->GetOutputHandler().SetTensorInfo(info); + + // Check graph layer sequence before inserting convert layers + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::MemCopyLayer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::AdditionLayer>, + &IsLayerOfType<armnn::OutputLayer>)); + + // Check layers have Float16 DataType + for (auto& layer : graph) + { + if(layer->GetType()==LayerType::Floor || layer->GetType() == LayerType::Addition) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float16); + BOOST_ASSERT(layer->GetDataType() == DataType::Float16); + } + } + + // Insert convert layers either side of unsupported layer + for (auto& layer : graph) + { + if(layer->GetType()==LayerType::Floor || layer->GetType() == LayerType::Addition) + { + InsertConvertFp16ToFp32LayersBefore(graph, *layer); + InsertConvertFp32ToFp16LayersAfter(graph, *layer); + } + } + + // Check layers have correct DataType after inserting convert layers + for (auto& layer : graph) + { + if (layer->GetType()==LayerType::Floor || 
layer->GetType() == LayerType::Addition) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float32); + BOOST_ASSERT(layer->GetDataType() == DataType::Float32); + } + else if (layer->GetType() == LayerType::ConvertFp16ToFp32) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float32); + BOOST_ASSERT(layer->GetDataType() == DataType::Float16); + } + else if (layer->GetType() == LayerType::ConvertFp32ToFp16) + { + BOOST_ASSERT(layer->GetOutputSlot(0).GetTensorInfo().GetDataType() == DataType::Float16); + BOOST_ASSERT(layer->GetDataType() == DataType::Float32); + } + } + + // Check sequence of layers after inserting convert layers + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::MemCopyLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::AdditionLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::OutputLayer>)); +} + +BOOST_AUTO_TEST_CASE(Fp32NetworkToFp16OptimizationTest) +{ + armnn::Graph graph; + + const armnn::TensorInfo infoFP32({ 2,2,1,3 }, armnn::DataType::Float32); + + // Create the simple test network + auto input = graph.AddLayer<armnn::InputLayer>(0, "input"); + input->GetOutputSlot().SetTensorInfo(infoFP32); + + auto floor = graph.AddLayer<armnn::FloorLayer>("floor"); + floor->GetOutputSlot().SetTensorInfo(infoFP32); + + auto output = graph.AddLayer<armnn::OutputLayer>(1, "output"); + + // Connect up the layers + input->GetOutputSlot().Connect(floor->GetInputSlot(0)); + floor->GetOutputSlot().Connect(output->GetInputSlot(0)); + + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::OutputLayer>)); + + // Run the optimizer + armnn::Optimizer::Pass(graph, armnn::MakeOptimizations(Fp32NetworkToFp16Converter())); + + BOOST_TEST(CheckSequence(graph.cbegin(), + graph.cend(), + &IsLayerOfType<armnn::InputLayer>, + &IsLayerOfType<armnn::ConvertFp32ToFp16Layer>, + &IsLayerOfType<armnn::FloorLayer>, + &IsLayerOfType<armnn::ConvertFp16ToFp32Layer>, + &IsLayerOfType<armnn::OutputLayer>)); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/ProfilerTests.cpp b/src/armnn/test/ProfilerTests.cpp new file mode 100644 index 0000000000..4450c5a08e --- /dev/null +++ b/src/armnn/test/ProfilerTests.cpp @@ -0,0 +1,235 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// + +#include <boost/test/unit_test.hpp> +#include <boost/test/output_test_stream.hpp> +#include <boost/algorithm/string.hpp> + +#include <memory> +#include <thread> + +#include <armnn/TypesUtils.hpp> +#include <Profiling.hpp> + +namespace armnn +{ + +size_t GetProfilerEventSequenceSize(armnn::Profiler* profiler) +{ + if (!profiler) + { + return static_cast<size_t>(-1); + } + + return profiler->m_EventSequence.size(); +} +} // namespace armnn + +namespace +{ + +void RegisterUnregisterProfilerSingleThreadImpl() +{ + // Important! Regular assertions must be used in this function for testing (rather than + // BOOST_TEST macros) otherwise multi-threading tests would randomly fail. + + // Get a reference to the profiler manager. 
+ armnn::ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + // Check that there's no profiler registered for this thread. + assert(!profilerManager.GetProfiler()); + + // Create and register a profiler for this thread. + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + profilerManager.RegisterProfiler(profiler.get()); + + // Check that on a single thread we get the same profiler we registered. + assert(profiler.get() == profilerManager.GetProfiler()); + + // Destroy the profiler. + profiler.reset(); + + // Check that the profiler has been un-registered for this thread. + assert(!profilerManager.GetProfiler()); +} + +} // namespace + +BOOST_AUTO_TEST_SUITE(Profiler) + +BOOST_AUTO_TEST_CASE(EnableDisableProfiling) +{ + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + + // Check that profiling is disabled by default. + BOOST_TEST(!profiler->IsProfilingEnabled()); + + // Enable profiling. + profiler->EnableProfiling(true); + + // Check that profiling is enabled. + BOOST_TEST(profiler->IsProfilingEnabled()); + + // Disable profiling. + profiler->EnableProfiling(false); + + // Check that profiling is disabled. + BOOST_TEST(!profiler->IsProfilingEnabled()); +} + +BOOST_AUTO_TEST_CASE(RegisterUnregisterProfilerSingleThread) +{ + RegisterUnregisterProfilerSingleThreadImpl(); +} + +BOOST_AUTO_TEST_CASE(RegisterUnregisterProfilerMultipleThreads) +{ + std::thread thread1([]() { RegisterUnregisterProfilerSingleThreadImpl(); }); + std::thread thread2([]() { RegisterUnregisterProfilerSingleThreadImpl(); }); + std::thread thread3([]() { RegisterUnregisterProfilerSingleThreadImpl(); }); + + thread1.join(); + thread2.join(); + thread3.join(); +} + +BOOST_AUTO_TEST_CASE(ProfilingMacros) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + { // --- No profiler --- + + // Check that there's no profiler registered for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); + + // Test scoped event. + { ARMNN_SCOPED_PROFILING_EVENT(armnn::Compute::CpuAcc, "test"); } + + // Check that we still cannot get a profiler for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); + } + + // Create and register a profiler for this thread. + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + profilerManager.RegisterProfiler(profiler.get()); + + { // --- Profiler, but profiling disabled --- + + // Get current event sequence size. + size_t eventSequenceSizeBefore = armnn::GetProfilerEventSequenceSize(profiler.get()); + + // Test scoped macro. + { ARMNN_SCOPED_PROFILING_EVENT(armnn::Compute::CpuAcc, "test"); } + + // Check that no profiling event has been added to the sequence. + size_t eventSequenceSizeAfter = armnn::GetProfilerEventSequenceSize(profiler.get()); + BOOST_TEST(eventSequenceSizeBefore == eventSequenceSizeAfter); + } + + // Enable profiling. + profiler->EnableProfiling(true); + + { // --- Profiler, and profiling enabled --- + + // Get current event sequence size. + size_t eventSequenceSizeBefore = armnn::GetProfilerEventSequenceSize(profiler.get()); + + // Test scoped macro. + { ARMNN_SCOPED_PROFILING_EVENT(armnn::Compute::CpuAcc, "test"); } + + // Check that a profiling event has been added to the sequence. 
+ size_t eventSequenceSizeAfter = armnn::GetProfilerEventSequenceSize(profiler.get()); + BOOST_TEST(eventSequenceSizeAfter == eventSequenceSizeBefore + 1); + } + + // Disable profiling here to not print out anything on stdout. + profiler->EnableProfiling(false); +} + +BOOST_AUTO_TEST_CASE(RuntimeLoadNetwork) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance(); + + // Check that there's no profiler registered for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); + + // Build a mock-network and load it into the runtime. + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + armnn::NetworkId networkIdentifier = 1; + armnn::INetworkPtr mockNetwork(armnn::INetwork::Create()); + mockNetwork->AddInputLayer(0, "test layer"); + std::vector<armnn::Compute> backends = { armnn::Compute::CpuRef }; + runtime->LoadNetwork(networkIdentifier, armnn::Optimize(*mockNetwork, backends, runtime->GetDeviceSpec())); + + // Check that there's now a profiler registered for this thread (created and registered by loading the network). + BOOST_TEST(profilerManager.GetProfiler()); + + // Unload the network. + runtime->UnloadNetwork(networkIdentifier); + + // Check that the profiler has been un-registered for this thread. + BOOST_TEST(!profilerManager.GetProfiler()); +} + +BOOST_AUTO_TEST_CASE(WriteEventResults) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profileManager = armnn::ProfilerManager::GetInstance(); + + // Create and register a profiler for this thread. + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + profileManager.RegisterProfiler(profiler.get()); + + // Enable profiling. + profiler->EnableProfiling(true); + + { // --- Profiler, and profiling enabled --- + + // Get current event sequence size. + size_t eventSequenceSizeBefore = armnn::GetProfilerEventSequenceSize(profiler.get()); + + // Test scoped macro. + { + // Need to create a ScopedProfilingEvent directly, as the one created by the macro falls out of scope + // immediately, causing the Event.Stop() method to be called immediately after the Event.Start() method, + // resulting in periodic test failures on the Dent and Smith HiKeys. + armnn::ScopedProfilingEvent testEvent(armnn::Compute::CpuAcc, "test", armnn::WallClockTimer()); + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + } + + // Check that a profiling event has been added to the sequence.
+ size_t eventSequenceSizeAfter = armnn::GetProfilerEventSequenceSize(profiler.get()); + BOOST_TEST(eventSequenceSizeAfter == eventSequenceSizeBefore + 1); + + boost::test_tools::output_test_stream output; + profiler->AnalyzeEventsAndWriteResults(output); + BOOST_TEST(!output.is_empty(false)); + + // output should contain event name 'test' + BOOST_CHECK(boost::contains(output.str(), "test")); + + // output should contain headers + BOOST_CHECK(boost::contains(output.str(), "Event Sequence - Name")); + BOOST_CHECK(boost::contains(output.str(), "Event Stats - Name")); + BOOST_CHECK(boost::contains(output.str(), "Total")); + BOOST_CHECK(boost::contains(output.str(), "Device")); + // output should contain compute device 'CpuAcc' + BOOST_CHECK(boost::contains(output.str(), "CpuAcc")); + // output should not contain un-readable numbers + BOOST_CHECK(!(boost::contains(output.str(), "e+"))); + // output should not contain un-readable numbers + BOOST_CHECK(!(boost::contains(output.str(), "+"))); + // output should not contain zero value + BOOST_CHECK(!(boost::contains(output.str(), " 0 "))); + } + + // Disable profiling here to not print out anything on stdout. + profiler->EnableProfiling(false); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/ProfilingEventTest.cpp b/src/armnn/test/ProfilingEventTest.cpp new file mode 100644 index 0000000000..4d0319d456 --- /dev/null +++ b/src/armnn/test/ProfilingEventTest.cpp @@ -0,0 +1,95 @@ +// +// Copyright © 2017 Arm Ltd. All rights reserved. +// See LICENSE file in the project root for full license information. +// +#include <boost/test/unit_test.hpp> + +#include "ProfilingEvent.hpp" +#include "Profiling.hpp" +#include <thread> + +using namespace armnn; + +BOOST_AUTO_TEST_SUITE(ProfilingEvent) + +BOOST_AUTO_TEST_CASE(ProfilingEventTest) +{ + // Get a reference to the profiler manager. + armnn::ProfilerManager& profileManager = armnn::ProfilerManager::GetInstance(); + + const char* eventName = "EventName"; + + Event::Instruments insts1; + insts1.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent(eventName, + nullptr, + nullptr, + armnn::Compute::Undefined, + std::move(insts1)); + + BOOST_CHECK_EQUAL(testEvent.GetName(), "EventName"); + + // start the timer - outer + testEvent.Start(); + + // wait for 10 milliseconds + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // stop the timer - outer + testEvent.Stop(); + + BOOST_CHECK_GE(testEvent.GetMeasurements().front().m_Value, 10.0); + + // create a sub event with CpuAcc + Event::Instruments insts2; + insts2.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent2(eventName, + profileManager.GetProfiler(), + &testEvent, + Compute::CpuAcc, + std::move(insts2)); + + BOOST_CHECK_EQUAL(&testEvent, testEvent2.GetParentEvent()); + BOOST_CHECK_EQUAL(profileManager.GetProfiler(), testEvent2.GetProfiler()); + BOOST_CHECK_EQUAL(Compute::CpuAcc, testEvent2.GetComputeDevice()); +} + +BOOST_AUTO_TEST_CASE(ProfilingEventTestOnGpuAcc) +{ + // Get a reference to the profiler manager. 
+ armnn::ProfilerManager& profileManager = armnn::ProfilerManager::GetInstance(); + + const char* eventName = "GPUEvent"; + + Event::Instruments insts1; + insts1.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent(eventName, + nullptr, + nullptr, + armnn::Compute::Undefined, + std::move(insts1)); + + BOOST_CHECK_EQUAL(testEvent.GetName(), "GPUEvent"); + + // start the timer - outer + testEvent.Start(); + + // wait for 10 milliseconds + std::this_thread::sleep_for(std::chrono::milliseconds(10)); + + // stop the timer - outer + testEvent.Stop(); + + BOOST_CHECK_GE(testEvent.GetMeasurements().front().m_Value, 10.0); + + // create a sub event + Event::Instruments insts2; + insts2.emplace_back(std::make_unique<WallClockTimer>()); + Event testEvent2(eventName, profileManager.GetProfiler(), &testEvent, Compute::GpuAcc, std::move(insts2)); + + BOOST_CHECK_EQUAL(&testEvent, testEvent2.GetParentEvent()); + BOOST_CHECK_EQUAL(profileManager.GetProfiler(), testEvent2.GetProfiler()); + BOOST_CHECK_EQUAL(Compute::GpuAcc, testEvent2.GetComputeDevice()); +} + +BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/RuntimeTests.cpp b/src/armnn/test/RuntimeTests.cpp index fcb0a1e7c2..e29a1d4841 100644 --- a/src/armnn/test/RuntimeTests.cpp +++ b/src/armnn/test/RuntimeTests.cpp @@ -32,33 +32,46 @@ BOOST_AUTO_TEST_SUITE(Runtime) BOOST_AUTO_TEST_CASE(RuntimeUnloadNetwork) { // build 2 mock-networks and load them into the runtime - armnn::IRuntimePtr runtime(armnn::IRuntime::Create(armnn::Compute::CpuRef)); + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); - // mock network 1 + // Mock network 1. armnn::NetworkId networkIdentifier1 = 1; armnn::INetworkPtr mockNetwork1(armnn::INetwork::Create()); mockNetwork1->AddInputLayer(0, "test layer"); - runtime->LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, runtime->GetDeviceSpec())); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + runtime->LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, backends, runtime->GetDeviceSpec())); - // mock network 2 + // Mock network 2. armnn::NetworkId networkIdentifier2 = 2; armnn::INetworkPtr mockNetwork2(armnn::INetwork::Create()); mockNetwork2->AddInputLayer(0, "test layer"); - runtime->LoadNetwork(networkIdentifier2, Optimize(*mockNetwork2, runtime->GetDeviceSpec())); + runtime->LoadNetwork(networkIdentifier2, Optimize(*mockNetwork2, backends, runtime->GetDeviceSpec())); - // unload one by its networkID + // Unloads one by its networkID. BOOST_TEST(runtime->UnloadNetwork(networkIdentifier1) == armnn::Status::Success); BOOST_TEST(runtime->UnloadNetwork(networkIdentifier1) == armnn::Status::Failure); } // Note: the current builds we don't do valgrind and gperftools based leak checking at the same -// time, so in practice WITH_VALGRIND and ARMNN_LEAK_CHECKING_ENABLED are exclusive. In -// the future the gperftools based leak checking should stay and the valgrind based should -// be removed. +// time, so in practice WITH_VALGRIND and ARMNN_LEAK_CHECKING_ENABLED are exclusive. The +// valgrind tests can stay for x86 builds, but on hikey Valgrind is just way too slow +// to be integrated into the CI system. 
-#if ARMNN_LEAK_CHECKING_ENABLED -void CreateAndDropDummyNetwork(armnn::Runtime & runtime) +#ifdef ARMNN_LEAK_CHECKING_ENABLED + +struct DisableGlobalLeakChecking +{ + DisableGlobalLeakChecking() + { + ARMNN_LOCAL_LEAK_CHECKING_ONLY(); + } +}; + +BOOST_GLOBAL_FIXTURE(DisableGlobalLeakChecking); + +void CreateAndDropDummyNetwork(const std::vector<armnn::Compute>& backends, armnn::Runtime& runtime) { armnn::NetworkId networkIdentifier; { @@ -74,12 +87,12 @@ void CreateAndDropDummyNetwork(armnn::Runtime & runtime) input->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); layer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. input->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); // optimize the network - armnn::IOptimizedNetworkPtr optNet = Optimize(*network, runtime.GetDeviceSpec()); + armnn::IOptimizedNetworkPtr optNet = Optimize(*network, backends, runtime.GetDeviceSpec()); runtime.LoadNetwork(networkIdentifier, std::move(optNet)); } @@ -94,10 +107,13 @@ BOOST_AUTO_TEST_CASE(RuntimeHeapMemoryUsageSanityChecks) ARMNN_SCOPED_LEAK_CHECKER("Sanity_Check_Outer"); { ARMNN_SCOPED_LEAK_CHECKER("Sanity_Check_Inner"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE() == true); std::unique_ptr<char[]> dummyAllocation(new char[1000]); - BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE() == false); - BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() >= 1000); - BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() >= 1); + BOOST_CHECK_MESSAGE(ARMNN_NO_LEAKS_IN_SCOPE() == false, + "A leak of 1000 bytes is expected here. " + "Please make sure environment variable: HEAPCHECK=draconian is set!"); + BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 1000); + BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 1); } BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); @@ -109,22 +125,24 @@ BOOST_AUTO_TEST_CASE(RuntimeHeapMemoryUsageSanityChecks) BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksGpuAcc) { BOOST_TEST(ARMNN_LEAK_CHECKER_IS_ACTIVE()); - - armnn::Runtime runtime(armnn::Compute::GpuAcc); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); + std::vector<armnn::Compute> backends = {armnn::Compute::GpuAcc}; { // Do a warmup of this so we make sure that all one-time // initialization happens before we do the leak checking. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); } { ARMNN_SCOPED_LEAK_CHECKER("LoadAndUnloadNetworkGpuAcc"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); // In the second run we check for all remaining memory // in use after the network was unloaded. If there is any // then it will be treated as a memory leak. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 0); @@ -136,22 +154,24 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksGpuAcc) BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksCpuAcc) { BOOST_TEST(ARMNN_LEAK_CHECKER_IS_ACTIVE()); - - armnn::Runtime runtime(armnn::Compute::CpuAcc); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuAcc}; { // Do a warmup of this so we make sure that all one-time // initialization happens before we do the leak checking. 
- CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); } { ARMNN_SCOPED_LEAK_CHECKER("LoadAndUnloadNetworkCpuAcc"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); // In the second run we check for all remaining memory // in use after the network was unloaded. If there is any // then it will be treated as a memory leak. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 0); @@ -163,21 +183,24 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeaksCpuRef) { BOOST_TEST(ARMNN_LEAK_CHECKER_IS_ACTIVE()); - armnn::Runtime runtime(armnn::Compute::CpuRef); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; { // Do a warmup of this so we make sure that all one-time // initialization happens before we do the leak checking. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); } { ARMNN_SCOPED_LEAK_CHECKER("LoadAndUnloadNetworkCpuRef"); + BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); // In the second run we check for all remaining memory // in use after the network was unloaded. If there is any // then it will be treated as a memory leak. - CreateAndDropDummyNetwork(runtime); + CreateAndDropDummyNetwork(backends, runtime); BOOST_TEST(ARMNN_NO_LEAKS_IN_SCOPE()); BOOST_TEST(ARMNN_BYTES_LEAKED_IN_SCOPE() == 0); BOOST_TEST(ARMNN_OBJECTS_LEAKED_IN_SCOPE() == 0); @@ -199,25 +222,28 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) // A start-pointer or chain of start-pointers to the block is found. Since the block is still pointed at, // the programmer could, at least in principle, have freed it before program exit. - // We want to test this in case memory is not freed as early as it could have been + // We want to test this in case memory is not freed as early as it could have been. unsigned long reachableBefore = 0; unsigned long reachableAfter = 0; - // needed as out params but we don't test them + // Needed as out params but we don't test them. unsigned long dubious = 0; unsigned long suppressed = 0; - // ensure that runtime is large enough before checking for memory leaks - // otherwise when loading the network it will automatically reserve memory that won't be released until destruction + // Ensure that runtime is large enough before checking for memory leaks. + // Otherwise, when loading the network, it will automatically reserve memory that won't be released + // until destruction. armnn::NetworkId networkIdentifier; - armnn::Runtime runtime(armnn::Compute::GpuAcc); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); - // check for leaks before we load the network and record them so that we can see the delta after unloading + // Checks for leaks before we load the network and record them so that we can see the delta after unloading. 
VALGRIND_DO_QUICK_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedBefore, dubious, reachableBefore, suppressed); // build a mock-network and load it into the runtime + std::vector<armnn::Compute> backends = {armnn::Compute::GpuAcc}; { armnn::TensorInfo inputTensorInfo(armnn::TensorShape({ 7, 7 }), armnn::DataType::Float32); armnn::TensorInfo outputTensorInfo(armnn::TensorShape({ 7, 7 }), armnn::DataType::Float32); @@ -231,12 +257,12 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) input->GetOutputSlot(0).Connect(layer->GetInputSlot(0)); layer->GetOutputSlot(0).Connect(output->GetInputSlot(0)); - // set the tensors in the network + // Sets the tensors in the network. input->GetOutputSlot(0).SetTensorInfo(inputTensorInfo); layer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo); // optimize the network - armnn::IOptimizedNetworkPtr optNet = Optimize(*mockNetwork, runtime.GetDeviceSpec()); + armnn::IOptimizedNetworkPtr optNet = Optimize(*mockNetwork, backends, runtime.GetDeviceSpec()); runtime.LoadNetwork(networkIdentifier, std::move(optNet)); } @@ -246,16 +272,16 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) VALGRIND_DO_ADDED_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedAfter, dubious, reachableAfter, suppressed); - // if we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass + // If we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass. BOOST_TEST(leakedBefore == leakedAfter); // Add resonable threshold after and before running valgrind with the ACL clear cache function. // TODO Threshold set to 80k until the root cause of the memory leakage is found and fixed. Revert threshold - // value to 1024 when fixed + // value to 1024 when fixed. BOOST_TEST(static_cast<long>(reachableAfter) - static_cast<long>(reachableBefore) < 81920); - // these are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters - // so they are assigned to, but still considered unused, causing a warning + // These are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters + // so they are assigned to, but still considered unused, causing a warning. boost::ignore_unused(dubious); boost::ignore_unused(suppressed); } @@ -263,7 +289,7 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryUsage) // Note: this part of the code is due to be removed when we fully trust the gperftools based results. #ifdef WITH_VALGRIND -// run with the following command to get all the amazing output (in the devenv/build folder) :) +// Run with the following command to get all the amazing output (in the devenv/build folder) :) // valgrind --leak-check=full --show-leak-kinds=all --log-file=Valgrind_Memcheck_Leak_Report.txt armnn/test/UnitTests BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) { @@ -276,11 +302,11 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) // A start-pointer or chain of start-pointers to the block is found. Since the block is still pointed at, // the programmer could, at least in principle, have freed it before program exit. - // We want to test this in case memory is not freed as early as it could have been + // We want to test this in case memory is not freed as early as it could have been. unsigned long reachableBefore = 0; unsigned long reachableAfter = 0; - // needed as out params but we don't test them + // Needed as out params but we don't test them. 
unsigned long dubious = 0; unsigned long suppressed = 0; @@ -288,14 +314,15 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) // ensure that runtime is large enough before checking for memory leaks // otherwise when loading the network it will automatically reserve memory that won't be released until destruction - armnn::Runtime runtime(armnn::Compute::CpuRef); + armnn::IRuntime::CreationOptions options; + armnn::Runtime runtime(options); armnn::RuntimeLoadedNetworksReserve(&runtime); - // check for leaks before we load the network and record them so that we can see the delta after unloading + // Checks for leaks before we load the network and record them so that we can see the delta after unloading. VALGRIND_DO_QUICK_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedBefore, dubious, reachableBefore, suppressed); - // build a mock-network and load it into the runtime + // Builds a mock-network and loads it into the runtime. { unsigned int inputShape[] = {1, 7, 1, 1}; armnn::TensorInfo inputTensorInfo(4, inputShape, armnn::DataType::Float32); @@ -303,10 +330,9 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) std::unique_ptr<armnn::Network> mockNetwork1 = std::make_unique<armnn::Network>(); mockNetwork1->AddInputLayer(0, "test layer"); - armnn::DeviceSpec device; - device.DefaultComputeDevice = armnn::Compute::CpuRef; - runtime.LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, device)); + std::vector<armnn::Compute> backends = {armnn::Compute::CpuRef}; + runtime.LoadNetwork(networkIdentifier1, Optimize(*mockNetwork1, backends, runtime.GetDeviceSpec())); } runtime.UnloadNetwork(networkIdentifier1); @@ -314,7 +340,7 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) VALGRIND_DO_ADDED_LEAK_CHECK; VALGRIND_COUNT_LEAKS(leakedAfter, dubious, reachableAfter, suppressed); - // if we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass + // If we're not running under Valgrind, these vars will have been initialised to 0, so this will always pass. BOOST_TEST(leakedBefore == leakedAfter); #if defined(ARMCOMPUTECL_ENABLED) @@ -329,11 +355,134 @@ BOOST_AUTO_TEST_CASE(RuntimeMemoryLeak) BOOST_TEST(reachableBefore >= reachableAfter); - // these are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters - // so they are assigned to, but still considered unused, causing a warning + // These are needed because VALGRIND_COUNT_LEAKS is a macro that assigns to the parameters + // so they are assigned to, but still considered unused, causing a warning. boost::ignore_unused(dubious); boost::ignore_unused(suppressed); } #endif +#if ARMCOMPUTENEON_ENABLED +BOOST_AUTO_TEST_CASE(RuntimeValidateCpuAccDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + + // Load it into the runtime. It should succeed.
+ armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == armnn::Status::Success); +} +#endif // ARMCOMPUTENEON_ENABLED + +#if ARMCOMPUTECL_ENABLED +BOOST_AUTO_TEST_CASE(RuntimeValidateGpuDeviceSupportLayerNoFallback) +{ + // build up the structure of the network + armnn::INetworkPtr net(armnn::INetwork::Create()); + + armnn::IConnectableLayer* input = net->AddInputLayer(0); + + armnn::IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(armnn::TensorInfo({ 1, 1, 4, 4 }, armnn::DataType::Float32)); + + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + std::vector<armnn::Compute> backends = { armnn::Compute::GpuAcc }; + armnn::IOptimizedNetworkPtr optNet = armnn::Optimize(*net, backends, runtime->GetDeviceSpec()); + BOOST_CHECK(optNet); + + // Load it into the runtime. It should succeed. + armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == armnn::Status::Success); +} +#endif // ARMCOMPUTECL_ENABLED + +BOOST_AUTO_TEST_CASE(RuntimeCpuRef) +{ + using namespace armnn; + + // Create runtime in which test will run + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc; it should fall back to CpuRef. + NormalizationDescriptor descriptor; + IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + + // optimize the network + std::vector<armnn::Compute> backends = { armnn::Compute::CpuRef }; + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should succeed. + armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Success); +} + +BOOST_AUTO_TEST_CASE(RuntimeFallbackToCpuRef) +{ + using namespace armnn; + + // Create runtime in which test will run + armnn::IRuntime::CreationOptions options; + armnn::IRuntimePtr runtime(armnn::IRuntime::Create(options)); + + // build up the structure of the network + INetworkPtr net(INetwork::Create()); + + IConnectableLayer* input = net->AddInputLayer(0); + + // This layer configuration isn't supported by CpuAcc; it should fall back to CpuRef. + NormalizationDescriptor descriptor; + IConnectableLayer* normalize = net->AddNormalizationLayer(descriptor); + + IConnectableLayer* output = net->AddOutputLayer(0); + + input->GetOutputSlot(0).Connect(normalize->GetInputSlot(0)); + normalize->GetOutputSlot(0).Connect(output->GetInputSlot(0)); + + input->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + normalize->GetOutputSlot(0).SetTensorInfo(TensorInfo({ 1, 1, 4, 4 }, DataType::Float32)); + + // Allow fallback to CpuRef.
+ std::vector<armnn::Compute> backends = { armnn::Compute::CpuAcc, armnn::Compute::CpuRef }; + // optimize the network + IOptimizedNetworkPtr optNet = Optimize(*net, backends, runtime->GetDeviceSpec()); + + // Load it into the runtime. It should succeed. + armnn::NetworkId netId; + BOOST_TEST(runtime->LoadNetwork(netId, std::move(optNet)) == Status::Success); +} + BOOST_AUTO_TEST_SUITE_END() diff --git a/src/armnn/test/TensorHelpers.hpp b/src/armnn/test/TensorHelpers.hpp index aac4c1d15e..ec38940a44 100644 --- a/src/armnn/test/TensorHelpers.hpp +++ b/src/armnn/test/TensorHelpers.hpp @@ -39,7 +39,7 @@ struct SelectiveComparer<T, false> { static bool Compare(T a, T b) { - // if a or b is zero, percent_tolerance does an exact match, so compare to a small, constant tolerance instead + // If a or b is zero, percent_tolerance does an exact match, so compare to a small, constant tolerance instead. if (a == 0.0f || b == 0.0f) { return std::abs(a - b) <= g_FloatCloseToZeroTolerance; @@ -62,7 +62,7 @@ template <typename T, std::size_t n> boost::test_tools::predicate_result CompareTensors(const boost::multi_array<T, n>& a, const boost::multi_array<T, n>& b) { - // check they are same shape + // Checks they are same shape. for (unsigned int i=0; i<n; i++) { if (a.shape()[i] != b.shape()[i]) @@ -77,9 +77,9 @@ boost::test_tools::predicate_result CompareTensors(const boost::multi_array<T, n } } - // now compare element-wise + // Now compares element-wise. - // fun iteration over n dimensions + // Fun iteration over n dimensions. std::array<unsigned int, n> indices; for (unsigned int i = 0; i < n; i++) { @@ -150,7 +150,7 @@ boost::test_tools::predicate_result CompareTensors(const boost::multi_array<T, n } -// Creates a boost::multi_array with shape defined by the given TensorInfo. +// Creates a boost::multi_array with the shape defined by the given TensorInfo. template <typename T, std::size_t n> boost::multi_array<T, n> MakeTensor(const armnn::TensorInfo& tensorInfo) { @@ -164,7 +164,7 @@ boost::multi_array<T, n> MakeTensor(const armnn::TensorInfo& tensorInfo) return boost::multi_array<T, n>(shape); } -// Creates a boost::multi_array with shape defined by the given TensorInfo and contents defined by the given vector. +// Creates a boost::multi_array with the shape defined by the given TensorInfo and contents defined by the given vector. template <typename T, std::size_t n> boost::multi_array<T, n> MakeTensor(const armnn::TensorInfo& tensorInfo, const std::vector<T>& flat) { diff --git a/src/armnn/test/TensorTest.cpp b/src/armnn/test/TensorTest.cpp index 2bb37f4fb8..8057d4dd7a 100644 --- a/src/armnn/test/TensorTest.cpp +++ b/src/armnn/test/TensorTest.cpp @@ -8,7 +8,7 @@ namespace armnn { -// Add unit test framework for interpreting TensorInfo type +// Adds unit test framework for interpreting TensorInfo type. std::ostream& boost_test_print_type(std::ostream& ostr, const TensorInfo& right) { ostr << "TensorInfo[ " @@ -115,7 +115,7 @@ BOOST_AUTO_TEST_CASE(TensorVsConstTensor) armnn::Tensor t(TensorInfo(), &mutableDatum); armnn::ConstTensor ct(TensorInfo(), &immutableDatum); - // Check that both Tensor and ConstTensor can be passed as a ConstTensor + // Checks that both Tensor and ConstTensor can be passed as a ConstTensor. 
CheckTensor(t); CheckTensor(ct); } @@ -136,9 +136,9 @@ BOOST_AUTO_TEST_CASE(ModifyTensorInfo) BOOST_AUTO_TEST_CASE(TensorShapeOperatorBrackets) { TensorShape shape({0,1,2,3}); - // Check version of operator[] which returns an unsigned int + // Checks version of operator[] which returns an unsigned int. BOOST_TEST(shape[2] == 2); - // Check the version of operator[] which returns a reference + // Checks the version of operator[] which returns a reference. shape[2] = 20; BOOST_TEST(shape[2] == 20); } diff --git a/src/armnn/test/UnitTests.cpp b/src/armnn/test/UnitTests.cpp index 0e2f99583f..203fbfe821 100644 --- a/src/armnn/test/UnitTests.cpp +++ b/src/armnn/test/UnitTests.cpp @@ -44,7 +44,7 @@ class SetupDebugOutput public: SetupDebugOutput() { - // Send the output to both cout (as standard) and the debug output. + // Sends the output to both cout (as standard) and the debug output. m_OutputStream.push(tee(std::cout)); m_OutputStream.push(m_DebugOutputSink); diff --git a/src/armnn/test/UnitTests.hpp b/src/armnn/test/UnitTests.hpp index 9b750b5b33..8d5c7055e7 100644 --- a/src/armnn/test/UnitTests.hpp +++ b/src/armnn/test/UnitTests.hpp @@ -12,7 +12,7 @@ inline void ConfigureLoggingTest() { - // Configure logging for both the ARMNN library and this test program + // Configures logging for both the ARMNN library and this test program. armnn::ConfigureLogging(true, true, armnn::LogSeverity::Fatal); armnnUtils::ConfigureLogging(boost::log::core::get().get(), true, true, armnn::LogSeverity::Fatal); } @@ -43,9 +43,27 @@ void CompareTestResultIfSupported(const std::string& testName, const LayerTestRe } } +template <typename T, std::size_t n> +void CompareTestResultIfSupported(const std::string& testName, const std::vector<LayerTestResult<T, n>>& testResult) +{ + bool testNameIndicatesUnsupported = testName.find("UNSUPPORTED") != std::string::npos; + for (unsigned int i = 0; i < testResult.size(); ++i) + { + BOOST_CHECK_MESSAGE(testNameIndicatesUnsupported != testResult[i].supported, + "The test name does not match the supportedness it is reporting"); + if (testResult[i].supported) + { + BOOST_TEST(CompareTensors(testResult[i].output, testResult[i].outputExpected)); + } + } +} + template<typename FactoryType, typename TFuncPtr, typename... Args> void RunTestFunction(const char* testName, TFuncPtr testFunction, Args... 
args) { + std::unique_ptr<armnn::Profiler> profiler = std::make_unique<armnn::Profiler>(); + armnn::ProfilerManager::GetInstance().RegisterProfiler(profiler.get()); + FactoryType workloadFactory; auto testResult = (*testFunction)(workloadFactory, args...); CompareTestResultIfSupported(testName, testResult); diff --git a/src/armnn/test/UtilsTests.cpp b/src/armnn/test/UtilsTests.cpp index 11fa51626c..2268aa31e2 100644 --- a/src/armnn/test/UtilsTests.cpp +++ b/src/armnn/test/UtilsTests.cpp @@ -4,10 +4,14 @@ // #include <boost/test/unit_test.hpp> + #include <armnn/Utils.hpp> #include <armnn/Types.hpp> #include <armnn/TypesUtils.hpp> #include <armnn/Descriptors.hpp> +#include <GraphTopologicalSort.hpp> +#include <Graph.hpp> +#include "TypeUtils.hpp" BOOST_AUTO_TEST_SUITE(Utils) @@ -55,4 +59,110 @@ BOOST_AUTO_TEST_CASE(PermuteDescriptorWithDuplicatedMappings) BOOST_CHECK_THROW(armnn::PermuteDescriptor({ 1u, 1u, 0u }), armnn::InvalidArgumentException); } +BOOST_AUTO_TEST_CASE(HalfType) +{ + using namespace half_float::literal; + armnn::Half a = 1.0_h; + + float b = 1.0f; + armnn::Half c(b); + + // Test half type + BOOST_CHECK_EQUAL(a, b); + BOOST_CHECK_EQUAL(sizeof(c), 2); + + // Test half type is floating point type + BOOST_CHECK(std::is_floating_point<armnn::Half>::value); + + // Test utility function returns correct type. + using ResolvedType = armnn::ResolveType<armnn::DataType::Float16>; + constexpr bool isHalfType = std::is_same<armnn::Half, ResolvedType>::value; + BOOST_CHECK(isHalfType); + + armnn::DataType dt = armnn::GetDataType<armnn::Half>(); + BOOST_CHECK(dt == armnn::DataType::Float16); + + //Test utility functions return correct size + BOOST_CHECK(GetDataTypeSize(armnn::DataType::Float16) == 2); + + //Test utility functions return correct name + BOOST_CHECK((GetDataTypeName(armnn::DataType::Float16) == std::string("Float16"))); +} + +BOOST_AUTO_TEST_CASE(GraphTopologicalSortSimpleTest) +{ + std::map<int, std::vector<int>> graph; + + graph[0] = {2}; + graph[1] = {3}; + graph[2] = {4}; + graph[3] = {4}; + graph[4] = {5}; + graph[5] = {}; + + auto getNodeInputs = [graph](int node) -> std::vector<int> + { + return graph.find(node)->second; + }; + + std::vector<int> targetNodes = {0, 1}; + + std::vector<int> output; + bool sortCompleted = armnnUtils::GraphTopologicalSort<int>(targetNodes, getNodeInputs, output); + + BOOST_TEST(sortCompleted); + + std::vector<int> correctResult = {5, 4, 2, 0, 3, 1}; + BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), correctResult.begin(), correctResult.end()); +} + +BOOST_AUTO_TEST_CASE(GraphTopologicalSortVariantTest) +{ + std::map<int, std::vector<int>> graph; + + graph[0] = {2}; + graph[1] = {2}; + graph[2] = {3, 4}; + graph[3] = {5}; + graph[4] = {5}; + graph[5] = {6}; + graph[6] = {}; + + auto getNodeInputs = [graph](int node) -> std::vector<int> + { + return graph.find(node)->second; + }; + + std::vector<int> targetNodes = {0, 1}; + + std::vector<int> output; + bool sortCompleted = armnnUtils::GraphTopologicalSort<int>(targetNodes, getNodeInputs, output); + + BOOST_TEST(sortCompleted); + + std::vector<int> correctResult = {6, 5, 3, 4, 2, 0, 1}; + BOOST_CHECK_EQUAL_COLLECTIONS(output.begin(), output.end(), correctResult.begin(), correctResult.end()); +} + +BOOST_AUTO_TEST_CASE(CyclicalGraphTopologicalSortTest) +{ + std::map<int, std::vector<int>> graph; + + graph[0] = {1}; + graph[1] = {2}; + graph[2] = {0}; + + auto getNodeInputs = [graph](int node) -> std::vector<int> + { + return graph.find(node)->second; + }; + + std::vector<int> 
targetNodes = {0}; + + std::vector<int> output; + bool sortCompleted = armnnUtils::GraphTopologicalSort<int>(targetNodes, getNodeInputs, output); + + BOOST_TEST(!sortCompleted); +} + BOOST_AUTO_TEST_SUITE_END()