author     Cathal Corbett <cathal.corbett@arm.com>  2022-03-21 09:27:08 +0000
committer  Cathal Corbett <cathal.corbett@arm.com>  2022-04-01 12:36:11 +0100
commit     a3f4fbaf9ce6e30b3d1337bdfbb47b7301f97d1d (patch)
tree       ce8d29eefd17a49fe431eba26e75d23dbde5aa81
parent     ebbf1d4853afd2f96bca83bfb4c225a78f94ea93 (diff)
download   armnn-a3f4fbaf9ce6e30b3d1337bdfbb47b7301f97d1d.tar.gz
IVGCVSW-6732 Tests surrounded in '#if defined(ARMNNREF_ENABLED)' in android-nn-driver do not execute.
* Change to src/backends/cl/workloads/ClLstmFloatWorkload.cpp to fix the LstmTests_GpuAcc tests.
* Changes to src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp & ClConvertFp32ToFp16Workload.cpp to fix the MeanTests_GpuAcc and Convolution2DTests_1.1 tests.
* Added unit tests to src/backends/cl/test/ClImportTensorHandleTests.cpp to test import on Convert layers.

!android-nn-driver:7264

Signed-off-by: Cathal Corbett <cathal.corbett@arm.com>
Change-Id: I0c46dc4b9c54eca8771ab12ed0302b6224606957
-rw-r--r--  include/armnn/INetwork.hpp                                   20
-rw-r--r--  src/armnn/LoadedNetwork.cpp                                   2
-rw-r--r--  src/armnn/Network.cpp                                        32
-rw-r--r--  src/armnn/Network.hpp                                         4
-rw-r--r--  src/backends/cl/test/ClImportTensorHandleTests.cpp          399
-rw-r--r--  src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp    6
-rw-r--r--  src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp    6
-rw-r--r--  src/backends/cl/workloads/ClLstmFloatWorkload.cpp             2
8 files changed, 459 insertions, 12 deletions
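
The workload fixes below all follow the same pattern: ReplaceOutputTensorHandle previously swapped and restored the handle in m_Data.m_Inputs instead of m_Data.m_Outputs. A minimal standalone sketch of the corrected swap-and-revert logic, using simplified stand-in types rather than the actual Arm NN classes:

#include <exception>
#include <vector>

// Simplified stand-ins for ITensorHandle and the workload's queue descriptor data.
struct TensorHandle {};

struct WorkloadData
{
    std::vector<TensorHandle*> m_Inputs;
    std::vector<TensorHandle*> m_Outputs;
};

struct ConvertWorkloadSketch
{
    WorkloadData m_Data;

    // Stand-in for the ACL reconfiguration step; may throw if the workload
    // cannot be rebuilt around the new handle.
    void Reconfigure() {}

    // Corrected pattern: operate on m_Outputs (the previous code touched
    // m_Inputs) and revert the slot if Reconfigure() throws.
    void ReplaceOutputTensorHandle(TensorHandle* tensorHandle, unsigned int slot)
    {
        TensorHandle* backupHandle = m_Data.m_Outputs[slot];
        m_Data.m_Outputs[slot] = tensorHandle;
        try
        {
            Reconfigure();
        }
        catch (const std::exception&)
        {
            // Cannot reconfigure, revert the slot back and rethrow.
            m_Data.m_Outputs[slot] = backupHandle;
            throw;
        }
    }
};
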
diff --git a/include/armnn/INetwork.hpp b/include/armnn/INetwork.hpp
index d962fe715b..eaec973899 100644
--- a/include/armnn/INetwork.hpp
+++ b/include/armnn/INetwork.hpp
@@ -810,6 +810,11 @@ protected:
const IDeviceSpec& deviceSpec,
const OptimizerOptions& options,
Optional<std::vector<std::string>&> messages);
+ friend IOptimizedNetworkPtr Optimize(const Graph& inGraph,
+ const std::vector<BackendId>& backendPreferences,
+ const IDeviceSpec& deviceSpec,
+ const OptimizerOptions& options,
+ Optional<std::vector<std::string>&> messages);
IOptimizedNetwork(std::unique_ptr<Graph> graph, const ModelOptions& modelOptions);
@@ -830,4 +835,19 @@ IOptimizedNetworkPtr Optimize(const INetwork& network,
const IDeviceSpec& deviceSpec,
const OptimizerOptions& options = OptimizerOptions(),
Optional<std::vector<std::string>&> messages = EmptyOptional());
+
+/// Create an optimized version of the network
+/// @param inGraph Graph to be optimized.
+/// @param backendPreferences The choice of backends, ordered by user preference.
+/// @param deviceSpec DeviceSpec object as queried from the runtime. See IRuntime::GetDeviceSpec().
+/// @param options OptimizerOptions object with optimizer configuration options.
+/// @param messages If there are failures or warnings, a string describing them will be added to the vector.
+/// @return An IOptimizedNetworkPtr interface to the optimized network, throws an exception derived from
+/// armnn::Exception if the process fails.
+
+IOptimizedNetworkPtr Optimize(const Graph& inGraph,
+ const std::vector<BackendId>& backendPreferences,
+ const IDeviceSpec& deviceSpec,
+ const OptimizerOptions& options,
+ Optional<std::vector<std::string>&> messages = EmptyOptional());
} //namespace armnn
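
For reference, the new Graph-based overload is exercised by the unit tests added to ClImportTensorHandleTests.cpp further down roughly as follows. This is a sketch, assuming an Arm NN build with the GpuAcc backend and access to the internal src/armnn/Network.hpp header:

#include <armnn/INetwork.hpp>
#include <armnn/IRuntime.hpp>
#include "Network.hpp" // internal header providing NetworkImpl and GetGraph()

#include <vector>

using namespace armnn;

int main()
{
    IRuntime::CreationOptions options;
    IRuntimePtr runtime(IRuntime::Create(options));

    // Build a trivial network: input -> ConvertFp16ToFp32 -> output.
    NetworkImpl network;
    IConnectableLayer* input   = network.AddInputLayer(0, "input");
    IConnectableLayer* convert = network.AddConvertFp16ToFp32Layer("convert");
    IConnectableLayer* output  = network.AddOutputLayer(0, "output");

    input->GetOutputSlot(0).Connect(convert->GetInputSlot(0));
    input->GetOutputSlot(0).SetTensorInfo(TensorInfo({1, 3, 2, 3}, DataType::Float16));
    convert->GetOutputSlot(0).Connect(output->GetInputSlot(0));
    convert->GetOutputSlot(0).SetTensorInfo(TensorInfo({1, 3, 2, 3}, DataType::Float32));

    // Optimize directly from the internal Graph using the new overload.
    OptimizerOptions optOptions;
    std::vector<BackendId> backends = { Compute::GpuAcc };
    IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends,
                                           runtime->GetDeviceSpec(), optOptions);
    return optNet ? 0 : 1;
}
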
diff --git a/src/armnn/LoadedNetwork.cpp b/src/armnn/LoadedNetwork.cpp
index 6d08fc321e..0e577354f0 100644
--- a/src/armnn/LoadedNetwork.cpp
+++ b/src/armnn/LoadedNetwork.cpp
@@ -1456,7 +1456,7 @@ std::vector<ImportedOutputId> LoadedNetwork::ImportOutputs(const OutputTensors&
{
throw MemoryImportException("ImportOutputs: Force Import failed, incorrect number of tensors");
}
- std::vector<ImportedInputId> importedOutputs;
+ std::vector<ImportedOutputId> importedOutputs;
Graph& graph = m_OptimizedNetwork->pOptimizedNetworkImpl->GetGraph().TopologicalSort();
unsigned int outputIndex = 0;
diff --git a/src/armnn/Network.cpp b/src/armnn/Network.cpp
index 339da0d1b8..a3655509fb 100644
--- a/src/armnn/Network.cpp
+++ b/src/armnn/Network.cpp
@@ -1658,7 +1658,7 @@ OptimizationResult SelectTensorHandleStrategy(Graph& optGraph,
return result;
}
-IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
+IOptimizedNetworkPtr Optimize(const Graph& inGraph,
const std::vector<BackendId>& backendPreferences,
const IDeviceSpec& deviceSpec,
const OptimizerOptions& options,
@@ -1667,7 +1667,7 @@ IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
ARMNN_LOG(debug) << options.ToString();
// Enable profiling
- auto profiler = inNetwork.pNetworkImpl->GetGraph().GetProfiler();
+ auto profiler = inGraph.GetProfiler();
ProfilerManager::GetInstance().RegisterProfiler(profiler.get());
profiler->EnableProfiling(options.m_ProfilingEnabled);
@@ -1683,9 +1683,9 @@ IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
}
// Ensure TensorInfo is set on all output slots of ConstantLayers in the graph
- inNetwork.pNetworkImpl->GetGraph().VerifyConstantLayerSetTensorInfo();
+ inGraph.VerifyConstantLayerSetTensorInfo();
- std::unique_ptr<Graph> graph = std::make_unique<Graph>(inNetwork.pNetworkImpl->GetGraph());
+ std::unique_ptr<Graph> graph = std::make_unique<Graph>(inGraph);
auto optNet = IOptimizedNetworkPtr(new IOptimizedNetwork(std::move(graph), options.m_ModelOptions),
&IOptimizedNetwork::Destroy);
@@ -1827,6 +1827,20 @@ IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
}
return optNet;
}
+
+IOptimizedNetworkPtr Optimize(const INetwork& inNetwork,
+ const std::vector<BackendId>& backendPreferences,
+ const IDeviceSpec& deviceSpec,
+ const OptimizerOptions& options,
+ Optional<std::vector<std::string>&> messages)
+{
+ return Optimize(inNetwork.pNetworkImpl->GetGraph(),
+ backendPreferences,
+ deviceSpec,
+ options,
+ messages);
+}
+
bool NetworkImpl::GetShapeInferenceMethod()
{
if (m_NetworkOptions.size() > 0 && m_NetworkOptions[0].GetBackendId().Get() == "ShapeInferenceMethod")
@@ -2000,6 +2014,16 @@ IConnectableLayer* NetworkImpl::AddConvolution2dLayerImpl(const Convolution2dDes
return layer;
}
+IConnectableLayer* NetworkImpl::AddConvertFp16ToFp32Layer(const char* name)
+{
+ return m_Graph->AddLayer<ConvertFp16ToFp32Layer>(name);
+}
+
+IConnectableLayer* NetworkImpl::AddConvertFp32ToFp16Layer(const char* name)
+{
+ return m_Graph->AddLayer<ConvertFp32ToFp16Layer>(name);
+}
+
IConnectableLayer* NetworkImpl::AddConvolution2dLayer(const Convolution2dDescriptor& convolution2dDescriptor,
const ConstTensor& weights,
const Optional<ConstTensor>& biases,
diff --git a/src/armnn/Network.hpp b/src/armnn/Network.hpp
index 1d88f267d1..fffad86b80 100644
--- a/src/armnn/Network.hpp
+++ b/src/armnn/Network.hpp
@@ -238,6 +238,10 @@ public:
const LstmInputParams& params,
const char* name = nullptr);
+ IConnectableLayer* AddConvertFp16ToFp32Layer(const char* name = nullptr);
+
+ IConnectableLayer* AddConvertFp32ToFp16Layer(const char* name = nullptr);
+
ARMNN_NO_DEPRECATE_WARN_BEGIN
void Accept(ILayerVisitor& visitor) const;
ARMNN_NO_DEPRECATE_WARN_END
diff --git a/src/backends/cl/test/ClImportTensorHandleTests.cpp b/src/backends/cl/test/ClImportTensorHandleTests.cpp
index 5fac9d13b1..e10e81ac26 100644
--- a/src/backends/cl/test/ClImportTensorHandleTests.cpp
+++ b/src/backends/cl/test/ClImportTensorHandleTests.cpp
@@ -14,6 +14,7 @@
#include <armnn/IRuntime.hpp>
#include <armnn/INetwork.hpp>
+#include "Network.hpp"
using namespace armnn;
@@ -427,6 +428,404 @@ TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConv2dEndToEnd")
CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
}
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp16toFp32EndToEnd")
+{
+ using namespace half_float::literal;
+
+ // Create runtime in which test will run
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+ // build up the structure of the network
+ NetworkImpl network;
+
+ armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float16);
+ armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+
+ std::vector<float> expectedOutput =
+ {
+ -37.5f, -15.2f, -8.76f, -2.0f, -1.5f, -1.3f, -0.5f, -0.4f, 0.0f,
+ 1.0f, 0.4f, 0.5f, 1.3f, 1.5f, 2.0f, 8.76f, 15.2f, 37.5f
+ };
+
+ unsigned int numElements = inputInfo.GetNumElements();
+ size_t totalBytesInput = numElements * sizeof(Half);
+ size_t totalBytesOutput = numElements * sizeof(float);
+
+ IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
+ ARMNN_ASSERT(inputLayer);
+
+ armnn::IConnectableLayer* const convLayer = network.AddConvertFp16ToFp32Layer("convert");
+ ARMNN_ASSERT(convLayer);
+
+ inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ IConnectableLayer* output = network.AddOutputLayer(0, "output");
+ convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+ convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+
+ // Optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = false;
+ std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+ IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
+ CHECK(optNet);
+
+ // Loads it into the runtime.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ // Import not enabled at load time; tensors are force-imported below via ImportInputs/ImportOutputs
+ INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ const size_t alignment =
+ arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+ size_t spaceInput = totalBytesInput + alignment + alignment;
+ size_t spaceOutput = totalBytesOutput + alignment + alignment;
+ auto inputData = std::make_unique<uint8_t[]>(spaceInput);
+ void* alignedInputPtr = inputData.get();
+ CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
+
+ // Populate the input values (negative and positive)
+ auto* inputPtr = reinterpret_cast<Half*>(alignedInputPtr);
+ inputPtr[0] = -37.5_h;
+ inputPtr[1] = -15.2_h;
+ inputPtr[2] = -8.76_h;
+ inputPtr[3] = -2.0_h;
+ inputPtr[4] = -1.5_h;
+ inputPtr[5] = -1.3_h;
+ inputPtr[6] = -0.5_h;
+ inputPtr[7] = -0.4_h;
+ inputPtr[8] = 0.0_h;
+ inputPtr[9] = 1.0_h;
+ inputPtr[10] = 0.4_h;
+ inputPtr[11] = 0.5_h;
+ inputPtr[12] = 1.3_h;
+ inputPtr[13] = 1.5_h;
+ inputPtr[14] = 2.0_h;
+ inputPtr[15] = 8.76_h;
+ inputPtr[16] = 15.2_h;
+ inputPtr[17] = 37.5_h;
+
+ auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
+ void* alignedOutputPtr = outputData.get();
+ CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
+ auto* outputPtr = reinterpret_cast<float*>(alignedOutputPtr);
+ std::fill_n(outputPtr, numElements, -10.0f);
+
+ TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
+ inputTensorInfo.SetConstant(true);
+ InputTensors inputTensors
+ {
+ {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
+ };
+ OutputTensors outputTensors
+ {
+ {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ INFO("Run ImportInputs");
+ std::vector<ImportedInputId> importedInputIds =
+ runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+ std::vector<ImportedOutputId> importedOutputIds =
+ runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Contains ConvertFp16ToFp32Workload
+ std::size_t found = dump.find("ConvertFp16ToFp32Workload");
+ CHECK(found != std::string::npos);
+
+ // Contains SyncMemGeneric
+ found = dump.find("SyncMemGeneric");
+ CHECK(found != std::string::npos);
+
+ // Does not contain CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ CHECK(found == std::string::npos);
+
+ runtime->UnloadNetwork(netId);
+
+ // Retrieve the output buffer
+ auto* outputResult = reinterpret_cast<float*>(alignedOutputPtr);
+ CHECK(outputResult);
+
+ // Check the output is correct
+ for (size_t i = 0; i < numElements; ++i)
+ {
+ DOCTEST_CHECK_MESSAGE(outputResult[i] == doctest::Approx(expectedOutput[i]).epsilon(0.0004),
+ "outputValue[" << i << "]: " << outputResult[i] << " != " << expectedOutput[i]);
+ }
+}
+
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportConvertFp32toFp16EndToEnd")
+{
+ using namespace half_float::literal;
+
+ // Create runtime in which test will run
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+ // build up the structure of the network
+ NetworkImpl network;
+
+ armnn::TensorInfo inputInfo({1, 3, 2, 3}, armnn::DataType::Float32);
+ armnn::TensorInfo outputTensorInfo({1, 3, 2, 3}, armnn::DataType::Float16);
+
+ std::vector<Half> expectedOutput =
+ {
+ -37.5_h, -15.2_h, -8.76_h, -2.0_h, -1.5_h, -1.3_h, -0.5_h, -0.4_h, 0.0_h,
+ 1.0_h, 0.4_h, 0.5_h, 1.3_h, 1.5_h, 2.0_h, 8.76_h, 15.2_h, 37.5_h
+ };
+
+ unsigned int numElements = inputInfo.GetNumElements();
+ size_t totalBytesInput = numElements * sizeof(float);
+ size_t totalBytesOutput = numElements * sizeof(Half);
+
+ IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
+ ARMNN_ASSERT(inputLayer);
+
+ armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
+ ARMNN_ASSERT(convLayer);
+
+ inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ IConnectableLayer* output = network.AddOutputLayer(0, "output");
+ convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+ convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+
+ // Optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = false;
+ std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+ IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
+ CHECK(optNet);
+
+ // Loads it into the runtime.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ // Import not enabled at load time; tensors are force-imported below via ImportInputs/ImportOutputs
+ INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ const size_t alignment =
+ arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+ size_t spaceInput = totalBytesInput + alignment + alignment;
+ size_t spaceOutput = totalBytesOutput + alignment + alignment;
+ auto inputData = std::make_unique<uint8_t[]>(spaceInput);
+ void* alignedInputPtr = inputData.get();
+ CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
+
+ // Populate the input values (negative and positive)
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ inputPtr[0] = -37.5f;
+ inputPtr[1] = -15.2f;
+ inputPtr[2] = -8.76f;
+ inputPtr[3] = -2.0f;
+ inputPtr[4] = -1.5f;
+ inputPtr[5] = -1.3f;
+ inputPtr[6] = -0.5f;
+ inputPtr[7] = -0.4f;
+ inputPtr[8] = 0.0f;
+ inputPtr[9] = 1.0f;
+ inputPtr[10] = 0.4f;
+ inputPtr[11] = 0.5f;
+ inputPtr[12] = 1.3f;
+ inputPtr[13] = 1.5f;
+ inputPtr[14] = 2.0f;
+ inputPtr[15] = 8.76f;
+ inputPtr[16] = 15.2f;
+ inputPtr[17] = 37.5f;
+
+ auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
+ void* alignedOutputPtr = outputData.get();
+ CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
+ auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
+ std::fill_n(outputPtr, numElements, -10.0f);
+
+ TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
+ inputTensorInfo.SetConstant(true);
+ InputTensors inputTensors
+ {
+ {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
+ };
+ OutputTensors outputTensors
+ {
+ {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ INFO("Run ImportInputs");
+ std::vector<ImportedInputId> importedInputIds =
+ runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+ std::vector<ImportedOutputId> importedOutputIds =
+ runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Contains ConvertFp32ToFp16Workload
+ std::size_t found = dump.find("ConvertFp32ToFp16Workload");
+ CHECK(found != std::string::npos);
+
+ // Contains SyncMemGeneric
+ found = dump.find("SyncMemGeneric");
+ CHECK(found != std::string::npos);
+
+ // Does not contain CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ CHECK(found == std::string::npos);
+
+ runtime->UnloadNetwork(netId);
+
+ // Retrieve the output buffer
+ auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
+ CHECK(outputResult);
+
+ // Check the output is correct
+ CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
+}
+
+TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportSimpleConvertFp32toFp16EndToEnd")
+{
+ using namespace half_float::literal;
+
+ // Create runtime in which test will run
+ IRuntime::CreationOptions options;
+ IRuntimePtr runtime(armnn::IRuntime::Create(options));
+
+ // build up the structure of the network
+ NetworkImpl network;
+
+ armnn::TensorInfo inputInfo({1}, armnn::DataType::Float32);
+ armnn::TensorInfo outputTensorInfo({1}, armnn::DataType::Float16);
+
+ std::vector<Half> expectedOutput = { 1.0_h };
+
+ unsigned int numElements = inputInfo.GetNumElements();
+ size_t totalBytesInput = numElements * sizeof(float);
+ size_t totalBytesOutput = numElements * sizeof(Half);
+
+ IConnectableLayer* const inputLayer = network.AddInputLayer(0, "input");
+ ARMNN_ASSERT(inputLayer);
+
+ armnn::IConnectableLayer* const convLayer = network.AddConvertFp32ToFp16Layer("convert");
+ ARMNN_ASSERT(convLayer);
+
+ inputLayer->GetOutputSlot(0).Connect(convLayer->GetInputSlot(0));
+ inputLayer->GetOutputSlot(0).SetTensorInfo(inputInfo);
+
+ IConnectableLayer* output = network.AddOutputLayer(0, "output");
+ convLayer->GetOutputSlot(0).Connect(output->GetInputSlot(0));
+ convLayer->GetOutputSlot(0).SetTensorInfo(outputTensorInfo);
+
+ // Optimize the network
+ OptimizerOptions optOptions;
+ optOptions.m_ImportEnabled = false;
+ std::vector<armnn::BackendId> backends = {armnn::Compute::GpuAcc};
+ IOptimizedNetworkPtr optNet = Optimize(network.GetGraph(), backends, runtime->GetDeviceSpec(), optOptions);
+ CHECK(optNet);
+
+ // Loads it into the runtime.
+ NetworkId netId;
+ std::string ignoredErrorMessage;
+ // Import not enabled at load time; tensors are force-imported below via ImportInputs/ImportOutputs
+ INetworkProperties networkProperties(false, MemorySource::Undefined, MemorySource::Undefined);
+ runtime->LoadNetwork(netId, std::move(optNet), ignoredErrorMessage, networkProperties);
+
+ // Creates structures for input & output
+ const size_t alignment =
+ arm_compute::CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_GLOBAL_MEM_CACHELINE_SIZE>();
+ size_t spaceInput = totalBytesInput + alignment + alignment;
+ size_t spaceOutput = totalBytesOutput + alignment + alignment;
+ auto inputData = std::make_unique<uint8_t[]>(spaceInput);
+ void* alignedInputPtr = inputData.get();
+ CHECK(std::align(alignment, totalBytesInput, alignedInputPtr, spaceInput));
+
+ // Populate the single input value
+ auto* inputPtr = reinterpret_cast<float*>(alignedInputPtr);
+ inputPtr[0] = 1.0f;
+
+ auto outputData = std::make_unique<uint8_t[]>(spaceOutput);
+ void* alignedOutputPtr = outputData.get();
+ CHECK(std::align(alignment, totalBytesOutput, alignedOutputPtr, spaceOutput));
+ auto* outputPtr = reinterpret_cast<Half*>(alignedOutputPtr);
+ std::fill_n(outputPtr, numElements, -10.0f);
+
+ TensorInfo inputTensorInfo = runtime->GetInputTensorInfo(netId, 0);
+ inputTensorInfo.SetConstant(true);
+ InputTensors inputTensors
+ {
+ {0,armnn::ConstTensor(inputTensorInfo, alignedInputPtr)},
+ };
+ OutputTensors outputTensors
+ {
+ {0,armnn::Tensor(runtime->GetOutputTensorInfo(netId, 0), alignedOutputPtr)}
+ };
+
+ runtime->GetProfiler(netId)->EnableProfiling(true);
+
+ INFO("Run ImportInputs");
+ std::vector<ImportedInputId> importedInputIds =
+ runtime->ImportInputs(netId, inputTensors, MemorySource::Malloc);
+ std::vector<ImportedOutputId> importedOutputIds =
+ runtime->ImportOutputs(netId, outputTensors, MemorySource::Malloc);
+
+ // Do the inference
+ runtime->EnqueueWorkload(netId, inputTensors, outputTensors, importedInputIds, importedOutputIds);
+
+ // Retrieve the Profiler.Print() output to get the workload execution
+ ProfilerManager& profilerManager = armnn::ProfilerManager::GetInstance();
+ std::stringstream ss;
+ profilerManager.GetProfiler()->Print(ss);
+ std::string dump = ss.str();
+
+ // Contains ConvertFp32ToFp16Workload
+ std::size_t found = dump.find("ConvertFp32ToFp16Workload");
+ CHECK(found != std::string::npos);
+
+ // Contains SyncMemGeneric
+ found = dump.find("SyncMemGeneric");
+ CHECK(found != std::string::npos);
+
+ // Does not contain CopyMemGeneric
+ found = dump.find("CopyMemGeneric");
+ CHECK(found == std::string::npos);
+
+ runtime->UnloadNetwork(netId);
+
+ // Retrieve the output buffer
+ auto* outputResult = reinterpret_cast<Half*>(alignedOutputPtr);
+ CHECK(outputResult);
+
+ // Check the output is correct
+ CHECK(std::equal(outputResult, outputResult + numElements, expectedOutput.begin(), expectedOutput.end()));
+}
+
TEST_CASE_FIXTURE(ClContextControlFixture, "ClForceImportRepeatedInferencesEndToEndTest")
{
/*
diff --git a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
index 8ccf157aca..4ac1274130 100644
--- a/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp16ToFp32Workload.cpp
@@ -80,8 +80,8 @@ void ClConvertFp16ToFp32Workload::ReplaceInputTensorHandle(ITensorHandle* tensor
// Replace output tensor handle with the given TensorHandle
void ClConvertFp16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
{
- ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
- this->m_Data.m_Inputs[slot] = tensorHandle;
+ ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+ this->m_Data.m_Outputs[slot] = tensorHandle;
try
{
Reconfigure();
@@ -89,7 +89,7 @@ void ClConvertFp16ToFp32Workload::ReplaceOutputTensorHandle(ITensorHandle* tenso
catch(armnn::UnimplementedException& e)
{
// Cannot reconfigure, revert the slot back and throw the exception.
- this->m_Data.m_Inputs[slot] = backupHandle;
+ this->m_Data.m_Outputs[slot] = backupHandle;
throw e;
}
}
diff --git a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
index a44a80c997..307314d784 100644
--- a/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
+++ b/src/backends/cl/workloads/ClConvertFp32ToFp16Workload.cpp
@@ -80,8 +80,8 @@ void ClConvertFp32ToFp16Workload::ReplaceInputTensorHandle(ITensorHandle* tensor
// Replace output tensor handle with the given TensorHandle
void ClConvertFp32ToFp16Workload::ReplaceOutputTensorHandle(ITensorHandle* tensorHandle, unsigned int slot)
{
- ITensorHandle* backupHandle = this->m_Data.m_Inputs[slot];
- this->m_Data.m_Inputs[slot] = tensorHandle;
+ ITensorHandle* backupHandle = this->m_Data.m_Outputs[slot];
+ this->m_Data.m_Outputs[slot] = tensorHandle;
try
{
Reconfigure();
@@ -89,7 +89,7 @@ void ClConvertFp32ToFp16Workload::ReplaceOutputTensorHandle(ITensorHandle* tenso
catch(armnn::UnimplementedException& e)
{
// Cannot reconfigure, revert the slot back and throw the exception.
- this->m_Data.m_Inputs[slot] = backupHandle;
+ this->m_Data.m_Outputs[slot] = backupHandle;
throw e;
}
}
diff --git a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
index e190f33bbc..d20c6fc7b5 100644
--- a/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
+++ b/src/backends/cl/workloads/ClLstmFloatWorkload.cpp
@@ -310,7 +310,7 @@ arm_compute::Status ClLstmFloatWorkloadValidate(const TensorInfo& input, const T
if (paramsInfo.m_ProjectionBias != nullptr)
{
- aclProjectionBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetInputGateBias());
+ aclProjectionBiasInfo = BuildArmComputeTensorInfo(paramsInfo.GetProjectionBias());
}
lstm_params_info.set_projection_params(&aclProjectionWeightsInfo,
paramsInfo.m_ProjectionBias != nullptr ?