From c2b99a8783388ec3bd90dfed2e1b6d4f4d4bd1c8 Mon Sep 17 00:00:00 2001
From: Matthew Sloyan
Date: Tue, 27 Apr 2021 17:16:12 +0100
Subject: IVGCVSW-5831 Add additional options to Arm NN External Delegate

 * Added enable-fast-math and number-of-threads options.
 * Added save-cached-network and cached-network-filepath options.
 * Added external_delegate python tests for new options.

Signed-off-by: Matthew Sloyan
Change-Id: I7cf6522a6f895cd71ed8f369d94a5113d78594f9
---
 delegate/python/test/test_external_delegate.py | 109 +++++++++++++++++++++++++
 delegate/src/armnn_external_delegate.cpp       |  53 ++++++++++++
 2 files changed, 162 insertions(+)

diff --git a/delegate/python/test/test_external_delegate.py b/delegate/python/test/test_external_delegate.py
index 93d373d0a1..f01a2d3928 100644
--- a/delegate/python/test/test_external_delegate.py
+++ b/delegate/python/test/test_external_delegate.py
@@ -57,6 +57,115 @@ def test_external_delegate_options_gpu_tuning(delegate_dir, test_data_folder, tm
     # cleanup
     os.remove(tuning_file)
 
+@pytest.mark.GpuAccTest
+def test_external_delegate_options_gpu_cached_network(delegate_dir, test_data_folder, tmp_path):
+
+    binary_file = os.path.join(str(tmp_path), "test_binary.bin")
+    # cleanup previous test run if necessary
+    if os.path.exists(binary_file):
+        os.remove(binary_file)
+
+    # Create blank binary file to write to.
+    open(binary_file, 'a').close()
+    assert (os.path.exists(binary_file))
+    assert (os.stat(binary_file).st_size == 0)
+
+    # Run inference to save cached network.
+    armnn_delegate = tflite.load_delegate(
+        delegate_dir,
+        options={
+            "backends": "GpuAcc",
+            "save-cached-network": "1",
+            "cached-network-filepath": binary_file,
+            "logging-severity": "info"})
+
+    run_mock_model(armnn_delegate, test_data_folder)
+
+    # destroy delegate and check if file has been saved.
+    armnn_delegate.__del__()
+    assert (os.stat(binary_file).st_size != 0)
+
+    # Create second delegate to load in binary file created.
+    armnn_delegate2 = tflite.load_delegate(
+        delegate_dir,
+        options={
+            "backends": "GpuAcc",
+            "cached-network-filepath": binary_file,
+            "logging-severity": "info"})
+
+    run_mock_model(armnn_delegate2, test_data_folder)
+
+    # cleanup
+    os.remove(binary_file)
+
+@pytest.mark.GpuAccTest
+def test_external_delegate_gpu_fastmath(delegate_dir, test_data_folder):
+    # create armnn delegate with enable-fast-math
+    # fast-math is only enabled on Conv2d layer, so use conv2d model.
+    armnn_delegate = tflite.load_delegate(delegate_dir, options = {'backends': 'GpuAcc',
+                                                                   'enable-fast-math': '1',
+                                                                   "logging-severity": "info"})
+
+    model_file_name = 'conv2d.tflite'
+
+    inputShape = [ 1, 5, 5, 1 ]
+    outputShape = [ 1, 3, 3, 1 ]
+
+    inputValues = [ 1, 5, 2, 3, 5,
+                    8, 7, 3, 6, 3,
+                    3, 3, 9, 1, 9,
+                    4, 1, 8, 1, 3,
+                    6, 8, 1, 9, 2 ]
+
+    expectedResult = [ 28, 38, 29,
+                       96, 104, 53,
+                       31, 55, 24 ]
+
+    input = np.array(inputValues, dtype=np.float32).reshape(inputShape)
+    expected_output = np.array(expectedResult, dtype=np.float32).reshape(outputShape)
+
+    # run the inference
+    armnn_outputs = run_inference(test_data_folder, model_file_name, [input], [armnn_delegate])
+
+    # check results
+    compare_outputs(armnn_outputs, [expected_output])
+
+@pytest.mark.CpuAccTest
+def test_external_delegate_cpu_options(capfd, delegate_dir, test_data_folder):
+    # create armnn delegate with enable-fast-math and number-of-threads options
+    # fast-math is only enabled on Conv2d layer, so use conv2d model.
+    armnn_delegate = tflite.load_delegate(delegate_dir, options = {'backends': 'CpuAcc',
+                                                                   'enable-fast-math': '1',
+                                                                   'number-of-threads': '4',
+                                                                   "logging-severity": "info"})
+
+    model_file_name = 'conv2d.tflite'
+
+    inputShape = [ 1, 5, 5, 1 ]
+    outputShape = [ 1, 3, 3, 1 ]
+
+    inputValues = [ 1, 5, 2, 3, 5,
+                    8, 7, 3, 6, 3,
+                    3, 3, 9, 1, 9,
+                    4, 1, 8, 1, 3,
+                    6, 8, 1, 9, 2 ]
+
+    expectedResult = [ 28, 38, 29,
+                       96, 104, 53,
+                       31, 55, 24 ]
+
+    input = np.array(inputValues, dtype=np.float32).reshape(inputShape)
+    expected_output = np.array(expectedResult, dtype=np.float32).reshape(outputShape)
+
+    # run the inference
+    armnn_outputs = run_inference(test_data_folder, model_file_name, [input], [armnn_delegate])
+
+    # check results
+    compare_outputs(armnn_outputs, [expected_output])
+
+    captured = capfd.readouterr()
+    assert 'Set CPPScheduler to Linear mode, with 4 threads to use' in captured.out
+
 def test_external_delegate_options_wrong_logging_level(delegate_dir):
     with pytest.raises(ValueError):
         tflite.load_delegate(
diff --git a/delegate/src/armnn_external_delegate.cpp b/delegate/src/armnn_external_delegate.cpp
index edf46efb98..27eaf64f73 100644
--- a/delegate/src/armnn_external_delegate.cpp
+++ b/delegate/src/armnn_external_delegate.cpp
@@ -4,6 +4,7 @@
 //
 #include "armnn_delegate.hpp"
 #include <armnn/Logging.hpp>
+#include <armnn/utility/NumericCast.hpp>
 
 #include <iostream>
 #include <string>
@@ -54,6 +55,10 @@ std::vector<std::string> gpu_options {"gpu-tuning-level",
  *              1,2 and 3 will create a tuning-file, 0 will apply the
  *              tunings from an existing file
  *
+ * Option key: "gpu-mlgo-tuning-file" \n
+ * Possible values: [filenameString] \n
+ * Description: File name for the MLGO tuning file
+ *
  * Option key: "gpu-tuning-file" \n
  * Possible values: [filenameString] \n
  * Description: File name for the tuning file.
@@ -62,6 +67,28 @@ std::vector<std::string> gpu_options {"gpu-tuning-level",
  * Possible values: ["true"/"false"] \n
  * Description: Enables GPU kernel profiling
  *
+ * Option key: "save-cached-network" \n
+ * Possible values: ["true"/"false"] \n
+ * Description: Enables saving of the cached network to a file,
+ *              specified with the cached-network-filepath option
+ *
+ * Option key: "cached-network-filepath" \n
+ * Possible values: [filenameString] \n
+ * Description: If non-empty, the given file will be used to load/save the cached network.
+ *              If save-cached-network is given then the cached network will be saved to the given file.
+ *              To save the cached network a file must already exist.
+ *              If save-cached-network is not given then the cached network will be loaded from the given file.
+ *              This will remove initial compilation time of kernels and speed up the first execution.
+ *
+ * Option key: "enable-fast-math" \n
+ * Possible values: ["true"/"false"] \n
+ * Description: Enables fast_math options in backends that support it
+ *
+ * Option key: "number-of-threads" \n
+ * Possible values: ["1"-"64"] \n
+ * Description: Assign the number of threads used by the CpuAcc backend.
+ *              Default is set to 0 (Backend will decide number of threads to use).
+ *
  * Option key: "reduce-fp32-to-fp16" \n
  * Possible values: ["true"/"false"] \n
  * Description: Reduce Fp32 data to Fp16 for faster processing
@@ -140,6 +167,32 @@ TfLiteDelegate* tflite_plugin_create_delegate(char** options_keys,
             armnn::BackendOptions option("GpuAcc", {{"KernelProfilingEnabled", (*options_values[i] != '0')}});
             options.AddBackendOption(option);
         }
+        else if (std::string(options_keys[i]) == std::string("save-cached-network"))
+        {
+            armnn::BackendOptions option("GpuAcc", {{"SaveCachedNetwork", (*options_values[i] != '0')}});
+            optimizerOptions.m_ModelOptions.push_back(option);
+        }
+        else if (std::string(options_keys[i]) == std::string("cached-network-filepath"))
+        {
+            armnn::BackendOptions option("GpuAcc", {{"CachedNetworkFilePath", std::string(options_values[i])}});
+            optimizerOptions.m_ModelOptions.push_back(option);
+        }
+        // Process GPU & CPU backend options
+        else if (std::string(options_keys[i]) == std::string("enable-fast-math"))
+        {
+            armnn::BackendOptions modelOptionGpu("GpuAcc", {{"FastMathEnabled", (*options_values[i] != '0')}});
+            optimizerOptions.m_ModelOptions.push_back(modelOptionGpu);
+
+            armnn::BackendOptions modelOptionCpu("CpuAcc", {{"FastMathEnabled", (*options_values[i] != '0')}});
+            optimizerOptions.m_ModelOptions.push_back(modelOptionCpu);
+        }
+        // Process CPU backend options
+        else if (std::string(options_keys[i]) == std::string("number-of-threads"))
+        {
+            unsigned int numberOfThreads = armnn::numeric_cast<unsigned int>(atoi(options_values[i]));
+            armnn::BackendOptions modelOption("CpuAcc", {{"NumberOfThreads", numberOfThreads}});
+            optimizerOptions.m_ModelOptions.push_back(modelOption);
+        }
         // Process reduce-fp32-to-fp16 option
         else if (std::string(options_keys[i]) == std::string("reduce-fp32-to-fp16"))
         {
--
cgit v1.2.1
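
Usage sketch: the snippet below shows how an application might pass the CpuAcc options
introduced by this patch through the TF Lite Python API, mirroring
test_external_delegate_cpu_options above. The delegate path "libarmnnDelegate.so" and the
model path "model.tflite" are placeholder assumptions, not taken from the patch; the
option keys and values are the ones added here, and the GpuAcc caching pair
(save-cached-network / cached-network-filepath) can be passed the same way when running
on GpuAcc, with the cache file created beforehand as the option documentation requires.

    import numpy as np
    import tflite_runtime.interpreter as tflite

    # Placeholder paths (assumptions): adjust to the local delegate build and model.
    DELEGATE_PATH = "libarmnnDelegate.so"
    MODEL_PATH = "model.tflite"

    # Load the Arm NN external delegate with the new CpuAcc options:
    # fast-math enabled and a fixed thread count of 4.
    armnn_delegate = tflite.load_delegate(
        DELEGATE_PATH,
        options={
            "backends": "CpuAcc",
            "enable-fast-math": "1",
            "number-of-threads": "4",
            "logging-severity": "info"})

    # Run one inference through the delegate.
    interpreter = tflite.Interpreter(model_path=MODEL_PATH,
                                     experimental_delegates=[armnn_delegate])
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_data = np.zeros(input_details[0]["shape"], dtype=input_details[0]["dtype"])
    interpreter.set_tensor(input_details[0]["index"], input_data)
    interpreter.invoke()
    result = interpreter.get_tensor(output_details[0]["index"])

With "number-of-threads" set to 4 and info logging enabled, the test above expects the
backend to report "Set CPPScheduler to Linear mode, with 4 threads to use" on stdout.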