Diffstat (limited to 'applications/inference_process/src/inference_process.cpp')
-rw-r--r--  applications/inference_process/src/inference_process.cpp | 393 ++++++++++++++++++++++++++++++++++
1 file changed, 393 insertions(+), 0 deletions(-)
diff --git a/applications/inference_process/src/inference_process.cpp b/applications/inference_process/src/inference_process.cpp
new file mode 100644
index 0000000..b5ed5c4
--- /dev/null
+++ b/applications/inference_process/src/inference_process.cpp
@@ -0,0 +1,393 @@
+/*
+ * Copyright (c) 2019-2020 Arm Limited. All rights reserved.
+ *
+ * SPDX-License-Identifier: Apache-2.0
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License"); you may
+ * not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ *     www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS, WITHOUT
+ * WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "tensorflow/lite/micro/all_ops_resolver.h"
+#include "tensorflow/lite/micro/cortex_m_generic/debug_log_callback.h"
+#include "tensorflow/lite/micro/micro_error_reporter.h"
+#include "tensorflow/lite/micro/micro_interpreter.h"
+#include "tensorflow/lite/micro/micro_profiler.h"
+#include "tensorflow/lite/schema/schema_generated.h"
+#include "tensorflow/lite/version.h"
+
+#include "inference_process.hpp"
+
+#include "cmsis_compiler.h"
+
+#include <inttypes.h>
+
+#ifndef TENSOR_ARENA_SIZE
+#define TENSOR_ARENA_SIZE (1024)
+#endif
+
+using namespace std;
+
+__attribute__((section(".bss.NoInit"), aligned(16))) uint8_t inferenceProcessTensorArena[TENSOR_ARENA_SIZE];
+
+namespace {
+
+void tflu_debug_log(const char *s) {
+    fprintf(stderr, "%s", s);
+}
+
+void print_output_data(TfLiteTensor *output, size_t bytesToPrint) {
+    const size_t numBytesToPrint = min(output->bytes, bytesToPrint);
+
+    int dims_size = output->dims->size;
+    printf("{\n");
+    printf("\"dims\": [%d,", dims_size);
+    for (int i = 0; i < output->dims->size - 1; ++i) {
+        printf("%d,", output->dims->data[i]);
+    }
+    printf("%d],\n", output->dims->data[dims_size - 1]);
+
+    printf("\"data_address\": \"%08" PRIx32 "\",\n", (uint32_t)output->data.data);
+    printf("\"data\":\"");
+    for (size_t i = 0; i + 1 < numBytesToPrint; ++i) {
+        if (i % 16 == 0 && i != 0) {
+            printf("\n");
+        }
+        printf("0x%02x,", output->data.uint8[i]);
+    }
+    printf("0x%02x\"\n", output->data.uint8[numBytesToPrint - 1]);
+    printf("}");
+}
+
+bool copyOutput(const TfLiteTensor &src, InferenceProcess::DataPtr &dst) {
+    if (dst.data == nullptr) {
+        return false;
+    }
+
+    if (src.bytes > dst.size) {
+        printf("Tensor size %zu exceeds output buffer size %zu.\n", src.bytes, dst.size);
+        return true;
+    }
+
+    copy(src.data.uint8, src.data.uint8 + src.bytes, static_cast<uint8_t *>(dst.data));
+    dst.size = src.bytes;
+
+    return false;
+}
+
+} // namespace
+
+namespace InferenceProcess {
+DataPtr::DataPtr(void *_data, size_t _size) : data(_data), size(_size) {}
+
+void DataPtr::invalidate() {
+#if defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
+    SCB_InvalidateDCache_by_Addr(reinterpret_cast<uint32_t *>(data), size);
+#endif
+}
+
+void DataPtr::clean() {
+#if defined(__DCACHE_PRESENT) && (__DCACHE_PRESENT == 1U)
+    SCB_CleanDCache_by_Addr(reinterpret_cast<uint32_t *>(data), size);
+#endif
+}
+
+InferenceJob::InferenceJob() : numBytesToPrint(0) {}
+
+InferenceJob::InferenceJob(const string &_name,
+                           const DataPtr &_networkModel,
+                           const vector<DataPtr> &_input,
+                           const vector<DataPtr> &_output,
+                           const vector<DataPtr> &_expectedOutput,
+                           size_t _numBytesToPrint,
+                           const vector<uint8_t> &_pmuEventConfig,
+                           const uint32_t _pmuCycleCounterEnable) :
+    name(_name),
+    networkModel(_networkModel), input(_input), output(_output), expectedOutput(_expectedOutput),
+    numBytesToPrint(_numBytesToPrint), pmuEventConfig(_pmuEventConfig), pmuCycleCounterEnable(_pmuCycleCounterEnable),
+    pmuEventCount(), pmuCycleCounterCount(0) {
+#if defined(INFERENCE_PROC_TFLU_PROFILER) && defined(ETHOSU)
+    pmuEventCount = vector<uint32_t>(ETHOSU_PMU_NCOUNTERS, 0);
+#endif
+}
+
+void InferenceJob::invalidate() {
+    networkModel.invalidate();
+
+    for (auto &it : input) {
+        it.invalidate();
+    }
+
+    for (auto &it : output) {
+        it.invalidate();
+    }
+
+    for (auto &it : expectedOutput) {
+        it.invalidate();
+    }
+}
+
+void InferenceJob::clean() {
+    networkModel.clean();
+
+    for (auto &it : input) {
+        it.clean();
+    }
+
+    for (auto &it : output) {
+        it.clean();
+    }
+
+    for (auto &it : expectedOutput) {
+        it.clean();
+    }
+}
+
+InferenceProcess::InferenceProcess() : lock(0) {}
+
+// NOTE: getLock & freeLock are adapted, with some corrections, from
+// http://infocenter.arm.com/help/index.jsp?topic=/com.arm.doc.dai0321a/BIHEJCHB.html
+// TODO: check correctness?
+void InferenceProcess::getLock() {
+    int status = 0;
+
+    do {
+        // Wait until lock_var is free
+        while (__LDREXW(&lock) != 0)
+            ;
+
+        // Try to set lock_var
+        status = __STREXW(1, &lock);
+    } while (status != 0);
+
+    // Do not start any other memory access until memory barrier is completed
+    __DMB();
+}
+
+// TODO: check correctness?
+void InferenceProcess::freeLock() {
+    // Ensure memory operations completed before releasing lock
+    __DMB();
+
+    lock = 0;
+}
+
+bool InferenceProcess::push(const InferenceJob &job) {
+    getLock();
+    inferenceJobQueue.push(job);
+    freeLock();
+
+    return true;
+}
+
+bool InferenceProcess::runJob(InferenceJob &job) {
+    printf("Running inference job: %s\n", job.name.c_str());
+
+    // Register debug log callback for profiling
+    RegisterDebugLogCallback(tflu_debug_log);
+
+    tflite::MicroErrorReporter microErrorReporter;
+    tflite::ErrorReporter *reporter = &microErrorReporter;
+
+    // Get model handle and verify that the version is correct
+    const tflite::Model *model = ::tflite::GetModel(job.networkModel.data);
+    if (model->version() != TFLITE_SCHEMA_VERSION) {
+        printf("Model provided is schema version %" PRIu32 " not equal to supported version %d.\n",
+               model->version(),
+               TFLITE_SCHEMA_VERSION);
+        return true;
+    }
+
+    // Create the TFL micro interpreter
+    tflite::AllOpsResolver resolver;
+    tflite::MicroProfiler profiler(reporter);
+
+#if defined(INFERENCE_PROC_TFLU_PROFILER) && defined(ETHOSU)
+    profiler.MonitorEthosuPMUEvents(ethosu_pmu_event_type(job.pmuEventConfig[0]),
+                                    ethosu_pmu_event_type(job.pmuEventConfig[1]),
+                                    ethosu_pmu_event_type(job.pmuEventConfig[2]),
+                                    ethosu_pmu_event_type(job.pmuEventConfig[3]));
+#endif
+
+    tflite::MicroInterpreter interpreter(
+        model, resolver, inferenceProcessTensorArena, TENSOR_ARENA_SIZE, reporter, &profiler);
+
+    // Allocate tensors
+    TfLiteStatus allocate_status = interpreter.AllocateTensors();
+    if (allocate_status != kTfLiteOk) {
+        printf("AllocateTensors failed for inference job: %s\n", job.name.c_str());
+        return true;
+    }
+
+    // Create a filtered list of non-empty input tensors
+    vector<TfLiteTensor *> inputTensors;
+    for (size_t i = 0; i < interpreter.inputs_size(); ++i) {
+        TfLiteTensor *tensor = interpreter.input(i);
+
+        if (tensor->bytes > 0) {
+            inputTensors.push_back(tensor);
+        }
+    }
+
+    if (job.input.size() != inputTensors.size()) {
+        printf("Number of input buffers does not match number of non-empty network tensors. input=%zu, network=%zu\n",
+               job.input.size(),
+               inputTensors.size());
+        return true;
+    }
+
+    // Copy input data
+    for (size_t i = 0; i < inputTensors.size(); ++i) {
+        const DataPtr &input       = job.input[i];
+        const TfLiteTensor *tensor = inputTensors[i];
+
+        if (input.size != tensor->bytes) {
+            printf("Input size does not match network size. job=%s, index=%zu, input=%zu, network=%zu\n",
+                   job.name.c_str(),
+                   i,
+                   input.size,
+                   tensor->bytes);
+            return true;
+        }
+
+        copy(static_cast<char *>(input.data), static_cast<char *>(input.data) + input.size, tensor->data.uint8);
+    }
+
+    // Run the inference
+    TfLiteStatus invoke_status = interpreter.Invoke();
+    if (invoke_status != kTfLiteOk) {
+        printf("Invoke failed for inference job: %s\n", job.name.c_str());
+        return true;
+    }
+
+    printf("%s : %zu\r\n", "arena_used_bytes", interpreter.arena_used_bytes());
+
+#ifdef INFERENCE_PROC_TFLU_PROFILER
+    printf("Inference runtime: %u cycles\r\n", (unsigned int)profiler.TotalInferenceTime());
+
+    if (job.pmuCycleCounterEnable != 0) {
+        job.pmuCycleCounterCount = profiler.TotalInferenceTime();
+    }
+
+#ifdef ETHOSU
+    for (uint32_t i = 0; i < ETHOSU_PMU_NCOUNTERS; i++) {
+        job.pmuEventCount[i] = profiler.GetEthosuPMUCounter(i);
+    }
+#endif
+#endif
+
+    // Copy output data
+    if (job.output.size() > 0) {
+        if (interpreter.outputs_size() != job.output.size()) {
+            printf("Number of outputs does not match network. job=%zu, network=%zu\n",
+                   job.output.size(),
+                   interpreter.outputs_size());
+            return true;
+        }
+
+        for (unsigned i = 0; i < interpreter.outputs_size(); ++i) {
+            if (copyOutput(*interpreter.output(i), job.output[i])) {
+                return true;
+            }
+        }
+    }
+
+    if (job.numBytesToPrint > 0) {
+        // Print all of the output data, or the first numBytesToPrint bytes,
+        // whichever comes first, as well as the output shape.
+        printf("num_of_outputs: %zu\n", interpreter.outputs_size());
+        printf("output_begin\n");
+        printf("[\n");
+
+        for (unsigned int i = 0; i < interpreter.outputs_size(); i++) {
+            TfLiteTensor *output = interpreter.output(i);
+            print_output_data(output, job.numBytesToPrint);
+            if (i != interpreter.outputs_size() - 1) {
+                printf(",\n");
+            }
+        }
+
+        printf("]\n");
+        printf("output_end\n");
+    }
+
+    if (job.expectedOutput.size() > 0) {
+        if (job.expectedOutput.size() != interpreter.outputs_size()) {
+            printf("Expected number of output tensors does not match network. job=%s, expected=%zu, network=%zu\n",
+                   job.name.c_str(),
+                   job.expectedOutput.size(),
+                   interpreter.outputs_size());
+            return true;
+        }
+
+        for (unsigned int i = 0; i < interpreter.outputs_size(); i++) {
+            const DataPtr &expected    = job.expectedOutput[i];
+            const TfLiteTensor *output = interpreter.output(i);
+
+            if (expected.size != output->bytes) {
+                printf(
+                    "Expected tensor size does not match network size. job=%s, index=%u, expected=%zu, network=%zu\n",
+                    job.name.c_str(),
+                    i,
+                    expected.size,
+                    output->bytes);
+                return true;
+            }
+
+            for (unsigned int j = 0; j < output->bytes; ++j) {
+                if (output->data.uint8[j] != static_cast<uint8_t *>(expected.data)[j]) {
+                    printf("Expected data does not match network output. job=%s, index=%u, offset=%u, "
+                           "expected=%02x, network=%02x\n",
+                           job.name.c_str(),
+                           i,
+                           j,
+                           static_cast<uint8_t *>(expected.data)[j],
+                           output->data.uint8[j]);
+                }
+            }
+        }
+    }
+
+    printf("Finished running job: %s\n", job.name.c_str());
+
+    return false;
+}
+
+bool InferenceProcess::run(bool exitOnEmpty) {
+    bool anyJobFailed = false;
+
+    while (true) {
+        getLock();
+        bool empty = inferenceJobQueue.empty();
+        freeLock();
+
+        if (empty) {
+            if (exitOnEmpty) {
+                printf("Exit from InferenceProcess::run() on empty job queue!\n");
+                break;
+            }
+
+            continue;
+        }
+
+        getLock();
+        InferenceJob job = inferenceJobQueue.front();
+        inferenceJobQueue.pop();
+        freeLock();
+
+        if (runJob(job)) {
+            anyJobFailed = true;
+            continue;
+        }
+    }
+
+    return anyJobFailed;
+}
+
+} // namespace InferenceProcess
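
For orientation, here is a minimal sketch of how a caller might drive this API, assuming a flatbuffer model blob and buffers sized to match the network's tensors. All names and sizes below are hypothetical and not part of this commit:

#include "inference_process.hpp"

#include <cstddef>
#include <cstdint>
#include <vector>

// Hypothetical model blob; a real application links in the compiled network.
extern uint8_t networkModelData[];
extern size_t networkModelSize;

int main() {
    InferenceProcess::InferenceProcess inferenceProcess;

    std::vector<uint8_t> inputBuffer(304); // must equal the input tensor size
    std::vector<uint8_t> outputBuffer(12); // must equal the output tensor size

    InferenceProcess::InferenceJob job(
        "example job",
        InferenceProcess::DataPtr(networkModelData, networkModelSize),
        {InferenceProcess::DataPtr(inputBuffer.data(), inputBuffer.size())},
        {InferenceProcess::DataPtr(outputBuffer.data(), outputBuffer.size())},
        {}, // no expected output to verify against
        0,  // do not print output data
        {}, // no PMU events (four event IDs needed if the Ethos-U profiler is enabled)
        0); // PMU cycle counter disabled

    inferenceProcess.push(job);

    // Drain the queue and exit when empty; run() returns true if any job failed.
    return inferenceProcess.run(true) ? 1 : 0;
}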
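DataPtr::clean() and DataPtr::invalidate() exist because the job buffers may be shared with another bus master (for example the Ethos-U NPU, or another core pushing jobs over a message queue) while the CPU data cache holds stale copies. A plausible pattern on a core with a data cache, reusing the job object from the sketch above, would be:

job.clean();                // write CPU-side data (model, input) back to memory
inferenceProcess.push(job);
inferenceProcess.run(true); // output may be written by another master
job.invalidate();           // drop stale cache lines before reading the output

This ordering is an assumption about the intended use; the file itself only provides the primitives, compiled in when __DCACHE_PRESENT is set.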
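When numBytesToPrint is non-zero, runJob() prints each output tensor between output_begin/output_end markers in the JSON-like layout produced by print_output_data(). For a single hypothetical output tensor of shape [1, 10] with four bytes requested, the emitted text would look like this (the data address is made up):

num_of_outputs: 1
output_begin
[
{
"dims": [2,1,10],
"data_address": "2001f8a0",
"data":"0x00,0x00,0x00,0x1a"
}]
output_end

Note that the first element of "dims" is the dimension count itself, followed by the actual dimensions; consumers parsing this output need to account for that.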