From c2b99a8783388ec3bd90dfed2e1b6d4f4d4bd1c8 Mon Sep 17 00:00:00 2001
From: Matthew Sloyan
Date: Tue, 27 Apr 2021 17:16:12 +0100
Subject: IVGCVSW-5831 Add additional options to Arm NN External Delegate

 * Added enable-fast-math and number-of-threads options.
 * Added save-cached-network and cached-network-filepath options.
 * Added external_delegate python tests for new options.

Signed-off-by: Matthew Sloyan
Change-Id: I7cf6522a6f895cd71ed8f369d94a5113d78594f9
---
 delegate/python/test/test_external_delegate.py | 109 +++++++++++++++++++++++++
 delegate/src/armnn_external_delegate.cpp       |  53 ++++++++++++
 2 files changed, 162 insertions(+)

diff --git a/delegate/python/test/test_external_delegate.py b/delegate/python/test/test_external_delegate.py
index 93d373d0a1..f01a2d3928 100644
--- a/delegate/python/test/test_external_delegate.py
+++ b/delegate/python/test/test_external_delegate.py
@@ -57,6 +57,115 @@ def test_external_delegate_options_gpu_tuning(delegate_dir, test_data_folder, tm
     # cleanup
     os.remove(tuning_file)
 
+@pytest.mark.GpuAccTest
+def test_external_delegate_options_gpu_cached_network(delegate_dir, test_data_folder, tmp_path):
+
+    binary_file = os.path.join(str(tmp_path), "test_binary.bin")
+    # cleanup previous test run if necessary
+    if os.path.exists(binary_file):
+        os.remove(binary_file)
+
+    # Create blank binary file to write to.
+    open(binary_file, 'a').close()
+    assert (os.path.exists(binary_file))
+    assert (os.stat(binary_file).st_size == 0)
+
+    # Run inference to save cached network.
+    armnn_delegate = tflite.load_delegate(
+        delegate_dir,
+        options={
+            "backends": "GpuAcc",
+            "save-cached-network": "1",
+            "cached-network-filepath": binary_file,
+            "logging-severity": "info"})
+
+    run_mock_model(armnn_delegate, test_data_folder)
+
+    # destroy delegate and check if file has been saved.
+    armnn_delegate.__del__()
+    assert (os.stat(binary_file).st_size != 0)
+
+    # Create second delegate to load in binary file created.
+    armnn_delegate2 = tflite.load_delegate(
+        delegate_dir,
+        options={
+            "backends": "GpuAcc",
+            "cached-network-filepath": binary_file,
+            "logging-severity": "info"})
+
+    run_mock_model(armnn_delegate2, test_data_folder)
+
+    # cleanup
+    os.remove(binary_file)
+
+@pytest.mark.GpuAccTest
+def test_external_delegate_gpu_fastmath(delegate_dir, test_data_folder):
+    # create armnn delegate with enable-fast-math
+    # fast-math is only enabled on Conv2d layer, so use conv2d model.
+    armnn_delegate = tflite.load_delegate(delegate_dir, options = {'backends': 'GpuAcc',
+                                                                   'enable-fast-math': '1',
+                                                                   "logging-severity": "info"})
+
+    model_file_name = 'conv2d.tflite'
+
+    inputShape = [ 1, 5, 5, 1 ]
+    outputShape = [ 1, 3, 3, 1 ]
+
+    inputValues = [ 1, 5, 2, 3, 5,
+                    8, 7, 3, 6, 3,
+                    3, 3, 9, 1, 9,
+                    4, 1, 8, 1, 3,
+                    6, 8, 1, 9, 2 ]
+
+    expectedResult = [ 28, 38, 29,
+                       96, 104, 53,
+                       31, 55, 24 ]
+
+    input = np.array(inputValues, dtype=np.float32).reshape(inputShape)
+    expected_output = np.array(expectedResult, dtype=np.float32).reshape(outputShape)
+
+    # run the inference
+    armnn_outputs = run_inference(test_data_folder, model_file_name, [input], [armnn_delegate])
+
+    # check results
+    compare_outputs(armnn_outputs, [expected_output])
+
+@pytest.mark.CpuAccTest
+def test_external_delegate_cpu_options(capfd, delegate_dir, test_data_folder):
+    # create armnn delegate with enable-fast-math and number-of-threads options
+    # fast-math is only enabled on Conv2d layer, so use conv2d model.
+    armnn_delegate = tflite.load_delegate(delegate_dir, options = {'backends': 'CpuAcc',
+                                                                   'enable-fast-math': '1',
+                                                                   'number-of-threads': '4',
+                                                                   "logging-severity": "info"})
+
+    model_file_name = 'conv2d.tflite'
+
+    inputShape = [ 1, 5, 5, 1 ]
+    outputShape = [ 1, 3, 3, 1 ]
+
+    inputValues = [ 1, 5, 2, 3, 5,
+                    8, 7, 3, 6, 3,
+                    3, 3, 9, 1, 9,
+                    4, 1, 8, 1, 3,
+                    6, 8, 1, 9, 2 ]
+
+    expectedResult = [ 28, 38, 29,
+                       96, 104, 53,
+                       31, 55, 24 ]
+
+    input = np.array(inputValues, dtype=np.float32).reshape(inputShape)
+    expected_output = np.array(expectedResult, dtype=np.float32).reshape(outputShape)
+
+    # run the inference
+    armnn_outputs = run_inference(test_data_folder, model_file_name, [input], [armnn_delegate])
+
+    # check results
+    compare_outputs(armnn_outputs, [expected_output])
+
+    captured = capfd.readouterr()
+    assert 'Set CPPScheduler to Linear mode, with 4 threads to use' in captured.out
+
 def test_external_delegate_options_wrong_logging_level(delegate_dir):
     with pytest.raises(ValueError):
         tflite.load_delegate(
diff --git a/delegate/src/armnn_external_delegate.cpp b/delegate/src/armnn_external_delegate.cpp
index edf46efb98..27eaf64f73 100644
--- a/delegate/src/armnn_external_delegate.cpp
+++ b/delegate/src/armnn_external_delegate.cpp
@@ -4,6 +4,7 @@
 //
 #include "armnn_delegate.hpp"
 #include <armnn/Logging.hpp>
+#include <armnn/utility/NumericCast.hpp>
 
 #include <iostream>
 #include <string>
@@ -54,6 +55,10 @@ std::vector<std::string> gpu_options {"gpu-tuning-level",
  *              1,2 and 3 will create a tuning-file, 0 will apply the
  *              tunings from an existing file
  *
+ * Option key: "gpu-mlgo-tuning-file" \n
+ * Possible values: [filenameString] \n
+ * Description: File name for the MLGO tuning file
+ *
  * Option key: "gpu-tuning-file" \n
  * Possible values: [filenameString] \n
  * Description: File name for the tuning file.
@@ -62,6 +67,28 @@ std::vector<std::string> gpu_options {"gpu-tuning-level",
  * Possible values: ["true"/"false"] \n
  * Description: Enables GPU kernel profiling
  *
+ * Option key: "save-cached-network" \n
+ * Possible values: ["true"/"false"] \n
+ * Description: Enables saving of the cached network to a file,
+ *              specified with the cached-network-filepath option
+ *
+ * Option key: "cached-network-filepath" \n
+ * Possible values: [filenameString] \n
+ * Description: If non-empty, the given file will be used to load/save the cached network.
+ *              If save-cached-network is given then the cached network will be saved to the given file.
+ *              To save the cached network a file must already exist.
+ *              If save-cached-network is not given then the cached network will be loaded from the given file.
+ *              This will remove initial compilation time of kernels and speed up the first execution.
+ *
+ * Option key: "enable-fast-math" \n
+ * Possible values: ["true"/"false"] \n
+ * Description: Enables fast_math options in backends that support it
+ *
+ * Option key: "number-of-threads" \n
+ * Possible values: ["1"-"64"] \n
+ * Description: Assign the number of threads used by the CpuAcc backend.
+ *              Default is set to 0 (Backend will decide number of threads to use).
+ *
  * Option key: "reduce-fp32-to-fp16" \n
  * Possible values: ["true"/"false"] \n
  * Description: Reduce Fp32 data to Fp16 for faster processing
@@ -140,6 +167,32 @@ TfLiteDelegate* tflite_plugin_create_delegate(char** options_keys,
             armnn::BackendOptions option("GpuAcc", {{"KernelProfilingEnabled", (*options_values[i] != '0')}});
             options.AddBackendOption(option);
         }
+        else if (std::string(options_keys[i]) == std::string("save-cached-network"))
+        {
+            armnn::BackendOptions option("GpuAcc", {{"SaveCachedNetwork", (*options_values[i] != '0')}});
+            optimizerOptions.m_ModelOptions.push_back(option);
+        }
+        else if (std::string(options_keys[i]) == std::string("cached-network-filepath"))
+        {
+            armnn::BackendOptions option("GpuAcc", {{"CachedNetworkFilePath", std::string(options_values[i])}});
+            optimizerOptions.m_ModelOptions.push_back(option);
+        }
+        // Process GPU & CPU backend options
+        else if (std::string(options_keys[i]) == std::string("enable-fast-math"))
+        {
+            armnn::BackendOptions modelOptionGpu("GpuAcc", {{"FastMathEnabled", (*options_values[i] != '0')}});
+            optimizerOptions.m_ModelOptions.push_back(modelOptionGpu);
+
+            armnn::BackendOptions modelOptionCpu("CpuAcc", {{"FastMathEnabled", (*options_values[i] != '0')}});
+            optimizerOptions.m_ModelOptions.push_back(modelOptionCpu);
+        }
+        // Process CPU backend options
+        else if (std::string(options_keys[i]) == std::string("number-of-threads"))
+        {
+            unsigned int numberOfThreads = armnn::numeric_cast<unsigned int>(atoi(options_values[i]));
+            armnn::BackendOptions modelOption("CpuAcc", {{"NumberOfThreads", numberOfThreads}});
+            optimizerOptions.m_ModelOptions.push_back(modelOption);
+        }
         // Process reduce-fp32-to-fp16 option
         else if (std::string(options_keys[i]) == std::string("reduce-fp32-to-fp16"))
         {
--
cgit v1.2.1
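
Usage sketch: the snippet below shows how an application might pass the CpuAcc options
introduced by this patch through the TF Lite Python API, mirroring
test_external_delegate_cpu_options above. The delegate path "libarmnnDelegate.so" and the
model path "model.tflite" are placeholder assumptions, not taken from the patch; the
option keys and values are the ones added here, and the GpuAcc caching pair
(save-cached-network / cached-network-filepath) can be passed the same way when running
on GpuAcc, with the cache file created beforehand as the option documentation requires.

    import numpy as np
    import tflite_runtime.interpreter as tflite

    # Placeholder paths (assumptions): adjust to the local delegate build and model.
    DELEGATE_PATH = "libarmnnDelegate.so"
    MODEL_PATH = "model.tflite"

    # Load the Arm NN external delegate with the new CpuAcc options:
    # fast-math enabled and a fixed thread count of 4.
    armnn_delegate = tflite.load_delegate(
        DELEGATE_PATH,
        options={
            "backends": "CpuAcc",
            "enable-fast-math": "1",
            "number-of-threads": "4",
            "logging-severity": "info"})

    # Run one inference through the delegate.
    interpreter = tflite.Interpreter(model_path=MODEL_PATH,
                                     experimental_delegates=[armnn_delegate])
    interpreter.allocate_tensors()
    input_details = interpreter.get_input_details()
    output_details = interpreter.get_output_details()
    input_data = np.zeros(input_details[0]["shape"], dtype=input_details[0]["dtype"])
    interpreter.set_tensor(input_details[0]["index"], input_data)
    interpreter.invoke()
    result = interpreter.get_tensor(output_details[0]["index"])

With "number-of-threads" set to 4 and info logging enabled, the test above expects the
backend to report "Set CPPScheduler to Linear mode, with 4 threads to use" on stdout.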