Diffstat (limited to 'python/pyarmnn/examples/object_detection')
-rw-r--r--  python/pyarmnn/examples/object_detection/requirements.txt    |   5
-rw-r--r--  python/pyarmnn/examples/object_detection/run_video_file.py   |  87
-rw-r--r--  python/pyarmnn/examples/object_detection/run_video_stream.py |  79
-rw-r--r--  python/pyarmnn/examples/object_detection/utils.py            | 231
4 files changed, 67 insertions(+), 335 deletions(-)
diff --git a/python/pyarmnn/examples/object_detection/requirements.txt b/python/pyarmnn/examples/object_detection/requirements.txt
index 7cc6379eb9..717a536a0e 100644
--- a/python/pyarmnn/examples/object_detection/requirements.txt
+++ b/python/pyarmnn/examples/object_detection/requirements.txt
@@ -1,3 +1,2 @@
-argparse>=1.4.0
-numpy>=1.19.0
-tqdm>=4.47.0
\ No newline at end of file
+numpy>=1.19.2
+tqdm>=4.47.0
diff --git a/python/pyarmnn/examples/object_detection/run_video_file.py b/python/pyarmnn/examples/object_detection/run_video_file.py
index 4f06eb184d..fc3e214721 100644
--- a/python/pyarmnn/examples/object_detection/run_video_file.py
+++ b/python/pyarmnn/examples/object_detection/run_video_file.py
@@ -7,55 +7,19 @@ bounding boxes and labels around detected objects, and saves the processed video
"""
import os
+import sys
+script_dir = os.path.dirname(__file__)
+sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
+
import cv2
-import pyarmnn as ann
from tqdm import tqdm
from argparse import ArgumentParser
from ssd import ssd_processing, ssd_resize_factor
from yolo import yolo_processing, yolo_resize_factor
-from utils import create_video_writer, create_network, dict_labels, preprocess, execute_network, draw_bounding_boxes
-
-
-parser = ArgumentParser()
-parser.add_argument('--video_file_path', required=True, type=str,
- help='Path to the video file to run object detection on')
-parser.add_argument('--model_file_path', required=True, type=str,
- help='Path to the Object Detection model to use')
-parser.add_argument('--model_name', required=True, type=str,
- help='The name of the model being used. Accepted options: ssd_mobilenet_v1, yolo_v3_tiny')
-parser.add_argument('--label_path', type=str,
- help='Path to the labelset for the provided model file')
-parser.add_argument('--output_video_file_path', type=str,
- help='Path to the output video file with detections added in')
-parser.add_argument('--preferred_backends', type=str, nargs='+', default=['CpuAcc', 'CpuRef'],
- help='Takes the preferred backends in preference order, separated by whitespace, '
- 'for example: CpuAcc GpuAcc CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. '
- 'Defaults to [CpuAcc, CpuRef]')
-args = parser.parse_args()
-
-
-def init_video(video_path: str, output_path: str):
- """
- Creates a video capture object from a video file.
-
- Args:
- video_path: User-specified video file path.
- output_path: Optional path to save the processed video.
-
- Returns:
- Video capture object to capture frames, video writer object to write processed
- frames to file, plus total frame count of video source to iterate through.
- """
- if not os.path.exists(video_path):
- raise FileNotFoundError(f'Video file not found for: {video_path}')
- video = cv2.VideoCapture(video_path)
- if not video.isOpened:
- raise RuntimeError(f'Failed to open video capture from file: {video_path}')
-
- video_writer = create_video_writer(video, video_path, output_path)
- iter_frame_count = range(int(video.get(cv2.CAP_PROP_FRAME_COUNT)))
- return video, video_writer, iter_frame_count
+from utils import dict_labels
+from cv_utils import init_video_file_capture, preprocess, draw_bounding_boxes
+from network_executor import ArmnnNetworkExecutor
def get_model_processing(model_name: str, video: cv2.VideoCapture, input_binding_info: tuple):
@@ -72,30 +36,29 @@ def get_model_processing(model_name: str, video: cv2.VideoCapture, input_binding
Model labels, decoding and processing functions.
"""
if model_name == 'ssd_mobilenet_v1':
- labels = os.path.join('ssd_labels.txt')
+ labels = os.path.join(script_dir, 'ssd_labels.txt')
return labels, ssd_processing, ssd_resize_factor(video)
elif model_name == 'yolo_v3_tiny':
- labels = os.path.join('yolo_labels.txt')
+ labels = os.path.join(script_dir, 'yolo_labels.txt')
return labels, yolo_processing, yolo_resize_factor(video, input_binding_info)
else:
raise ValueError(f'{model_name} is not a valid model name')
def main(args):
- video, video_writer, frame_count = init_video(args.video_file_path, args.output_video_file_path)
- net_id, runtime, input_binding_info, output_binding_info = create_network(args.model_file_path,
- args.preferred_backends)
- output_tensors = ann.make_output_tensors(output_binding_info)
- labels, process_output, resize_factor = get_model_processing(args.model_name, video, input_binding_info)
- labels = dict_labels(labels if args.label_path is None else args.label_path)
+ video, video_writer, frame_count = init_video_file_capture(args.video_file_path, args.output_video_file_path)
+
+ executor = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)
+ labels, process_output, resize_factor = get_model_processing(args.model_name, video, executor.input_binding_info)
+ labels = dict_labels(labels if args.label_path is None else args.label_path, include_rgb=True)
for _ in tqdm(frame_count, desc='Processing frames'):
frame_present, frame = video.read()
if not frame_present:
continue
- input_tensors = preprocess(frame, input_binding_info)
- inference_output = execute_network(input_tensors, output_tensors, runtime, net_id)
- detections = process_output(inference_output)
+ input_tensors = preprocess(frame, executor.input_binding_info)
+ output_result = executor.run(input_tensors)
+ detections = process_output(output_result)
draw_bounding_boxes(frame, detections, resize_factor, labels)
video_writer.write(frame)
print('Finished processing frames')
@@ -103,4 +66,20 @@ def main(args):
if __name__ == '__main__':
+ parser = ArgumentParser()
+ parser.add_argument('--video_file_path', required=True, type=str,
+ help='Path to the video file to run object detection on')
+ parser.add_argument('--model_file_path', required=True, type=str,
+ help='Path to the Object Detection model to use')
+ parser.add_argument('--model_name', required=True, type=str,
+ help='The name of the model being used. Accepted options: ssd_mobilenet_v1, yolo_v3_tiny')
+ parser.add_argument('--label_path', type=str,
+ help='Path to the labelset for the provided model file')
+ parser.add_argument('--output_video_file_path', type=str,
+ help='Path to the output video file with detections added in')
+ parser.add_argument('--preferred_backends', type=str, nargs='+', default=['CpuAcc', 'CpuRef'],
+ help='Takes the preferred backends in preference order, separated by whitespace, '
+ 'for example: CpuAcc GpuAcc CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. '
+ 'Defaults to [CpuAcc, CpuRef]')
+ args = parser.parse_args()
main(args)
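
For context, the per-frame flow in run_video_file.py now runs through the shared helpers instead of the deleted utils.py. Below is a minimal sketch of that flow, assembled only from the calls visible in this patch; the video path, model file and label file names are illustrative placeholders, and SSD post-processing is chosen purely as an example.

    # Sketch of the refactored flow, assuming the '../common' helpers shipped with
    # these examples; file names below are placeholders, not part of the patch.
    import os
    import sys
    sys.path.insert(1, os.path.join(os.path.dirname(__file__), '..', 'common'))

    from cv_utils import init_video_file_capture, preprocess, draw_bounding_boxes
    from network_executor import ArmnnNetworkExecutor
    from utils import dict_labels
    from ssd import ssd_processing, ssd_resize_factor

    video, video_writer, frame_count = init_video_file_capture('sample.mp4', None)    # placeholder input, default output dir
    executor = ArmnnNetworkExecutor('ssd_mobilenet_v1.tflite', ['CpuAcc', 'CpuRef'])  # placeholder model file
    labels = dict_labels('ssd_labels.txt', include_rgb=True)
    resize_factor = ssd_resize_factor(video)

    for _ in frame_count:
        frame_present, frame = video.read()
        if not frame_present:
            continue
        input_tensors = preprocess(frame, executor.input_binding_info)  # resize + dtype conversion
        detections = ssd_processing(executor.run(input_tensors))        # decode boxes, classes, scores
        draw_bounding_boxes(frame, detections, resize_factor, labels)
        video_writer.write(frame)
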
diff --git a/python/pyarmnn/examples/object_detection/run_video_stream.py b/python/pyarmnn/examples/object_detection/run_video_stream.py
index 94dc6c8b13..9a303e8129 100644
--- a/python/pyarmnn/examples/object_detection/run_video_stream.py
+++ b/python/pyarmnn/examples/object_detection/run_video_stream.py
@@ -8,47 +8,18 @@ and displays a window with the latest processed frame.
"""
import os
+import sys
+script_dir = os.path.dirname(__file__)
+sys.path.insert(1, os.path.join(script_dir, '..', 'common'))
+
import cv2
-import pyarmnn as ann
-from tqdm import tqdm
from argparse import ArgumentParser
from ssd import ssd_processing, ssd_resize_factor
from yolo import yolo_processing, yolo_resize_factor
-from utils import create_network, dict_labels, preprocess, execute_network, draw_bounding_boxes
-
-
-parser = ArgumentParser()
-parser.add_argument('--video_source', type=int, default=0,
- help='Device index to access video stream. Defaults to primary device camera at index 0')
-parser.add_argument('--model_file_path', required=True, type=str,
- help='Path to the Object Detection model to use')
-parser.add_argument('--model_name', required=True, type=str,
- help='The name of the model being used. Accepted options: ssd_mobilenet_v1, yolo_v3_tiny')
-parser.add_argument('--label_path', type=str,
- help='Path to the labelset for the provided model file')
-parser.add_argument('--preferred_backends', type=str, nargs='+', default=['CpuAcc', 'CpuRef'],
- help='Takes the preferred backends in preference order, separated by whitespace, '
- 'for example: CpuAcc GpuAcc CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. '
- 'Defaults to [CpuAcc, CpuRef]')
-args = parser.parse_args()
-
-
-def init_video(video_source: int):
- """
- Creates a video capture object from a device.
-
- Args:
- video_source: Device index used to read video stream.
-
- Returns:
- Video capture object used to capture frames from a video stream.
- """
- video = cv2.VideoCapture(video_source)
- if not video.isOpened:
- raise RuntimeError(f'Failed to open video capture for device with index: {video_source}')
- print('Processing video stream. Press \'Esc\' key to exit the demo.')
- return video
+from utils import dict_labels
+from cv_utils import init_video_stream_capture, preprocess, draw_bounding_boxes
+from network_executor import ArmnnNetworkExecutor
def get_model_processing(model_name: str, video: cv2.VideoCapture, input_binding_info: tuple):
@@ -65,31 +36,31 @@ def get_model_processing(model_name: str, video: cv2.VideoCapture, input_binding
Model labels, decoding and processing functions.
"""
if model_name == 'ssd_mobilenet_v1':
- labels = os.path.join('ssd_labels.txt')
+ labels = os.path.join(script_dir, 'ssd_labels.txt')
return labels, ssd_processing, ssd_resize_factor(video)
elif model_name == 'yolo_v3_tiny':
- labels = os.path.join('yolo_labels.txt')
+ labels = os.path.join(script_dir, 'yolo_labels.txt')
return labels, yolo_processing, yolo_resize_factor(video, input_binding_info)
else:
raise ValueError(f'{model_name} is not a valid model name')
def main(args):
- video = init_video(args.video_source)
- net_id, runtime, input_binding_info, output_binding_info = create_network(args.model_file_path,
- args.preferred_backends)
- output_tensors = ann.make_output_tensors(output_binding_info)
- labels, process_output, resize_factor = get_model_processing(args.model_name, video, input_binding_info)
- labels = dict_labels(labels if args.label_path is None else args.label_path)
+ video = init_video_stream_capture(args.video_source)
+ executor = ArmnnNetworkExecutor(args.model_file_path, args.preferred_backends)
+
+ labels, process_output, resize_factor = get_model_processing(args.model_name, video, executor.input_binding_info)
+ labels = dict_labels(labels if args.label_path is None else args.label_path, include_rgb=True)
while True:
frame_present, frame = video.read()
frame = cv2.flip(frame, 1) # Horizontally flip the frame
if not frame_present:
raise RuntimeError('Error reading frame from video stream')
- input_tensors = preprocess(frame, input_binding_info)
- inference_output = execute_network(input_tensors, output_tensors, runtime, net_id)
- detections = process_output(inference_output)
+ input_tensors = preprocess(frame, executor.input_binding_info)
+ print("Running inference...")
+ output_result = executor.run(input_tensors)
+ detections = process_output(output_result)
draw_bounding_boxes(frame, detections, resize_factor, labels)
cv2.imshow('PyArmNN Object Detection Demo', frame)
if cv2.waitKey(1) == 27:
@@ -99,4 +70,18 @@ def main(args):
if __name__ == '__main__':
+ parser = ArgumentParser()
+ parser.add_argument('--video_source', type=int, default=0,
+ help='Device index to access video stream. Defaults to primary device camera at index 0')
+ parser.add_argument('--model_file_path', required=True, type=str,
+ help='Path to the Object Detection model to use')
+ parser.add_argument('--model_name', required=True, type=str,
+ help='The name of the model being used. Accepted options: ssd_mobilenet_v1, yolo_v3_tiny')
+ parser.add_argument('--label_path', type=str,
+ help='Path to the labelset for the provided model file')
+ parser.add_argument('--preferred_backends', type=str, nargs='+', default=['CpuAcc', 'CpuRef'],
+ help='Takes the preferred backends in preference order, separated by whitespace, '
+ 'for example: CpuAcc GpuAcc CpuRef. Accepted options: [CpuAcc, CpuRef, GpuAcc]. '
+ 'Defaults to [CpuAcc, CpuRef]')
+ args = parser.parse_args()
main(args)
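
A practical effect of moving argument parsing under the __main__ guard in both scripts is that they can now be imported without parse_args() running at import time. A hedged example of driving run_video_stream programmatically; the model name and file path are placeholders:

    # Hypothetical programmatic invocation, possible only because parser.parse_args()
    # no longer executes on import. Model name/path below are placeholders.
    from argparse import Namespace
    import run_video_stream

    run_video_stream.main(Namespace(video_source=0,
                                    model_file_path='yolo_v3_tiny.tflite',  # placeholder
                                    model_name='yolo_v3_tiny',
                                    label_path=None,
                                    preferred_backends=['CpuAcc', 'CpuRef']))
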
diff --git a/python/pyarmnn/examples/object_detection/utils.py b/python/pyarmnn/examples/object_detection/utils.py
deleted file mode 100644
index 1235bf4fa6..0000000000
--- a/python/pyarmnn/examples/object_detection/utils.py
+++ /dev/null
@@ -1,231 +0,0 @@
-# Copyright © 2020 Arm Ltd and Contributors. All rights reserved.
-# SPDX-License-Identifier: MIT
-
-"""
-This file contains shared functions used in the object detection scripts for
-preprocessing data, preparing the network and postprocessing.
-"""
-
-import os
-import cv2
-import numpy as np
-import pyarmnn as ann
-
-
-def create_video_writer(video: cv2.VideoCapture, video_path: str, output_path: str):
- """
- Creates a video writer object to write processed frames to file.
-
- Args:
- video: Video capture object, contains information about data source.
- video_path: User-specified video file path.
- output_path: Optional path to save the processed video.
-
- Returns:
- Video writer object.
- """
- _, ext = os.path.splitext(video_path)
-
- if output_path is not None:
- assert os.path.isdir(output_path)
-
- i, filename = 0, os.path.join(output_path if output_path is not None else str(), f'object_detection_demo{ext}')
- while os.path.exists(filename):
- i += 1
- filename = os.path.join(output_path if output_path is not None else str(), f'object_detection_demo({i}){ext}')
-
- video_writer = cv2.VideoWriter(filename=filename,
- fourcc=cv2.VideoWriter_fourcc(*'mp4v'),
- fps=int(video.get(cv2.CAP_PROP_FPS)),
- frameSize=(int(video.get(cv2.CAP_PROP_FRAME_WIDTH)),
- int(video.get(cv2.CAP_PROP_FRAME_HEIGHT))))
- return video_writer
-
-
-def create_network(model_file: str, backends: list):
- """
- Creates a network based on the model file and a list of backends.
-
- Args:
- model_file: User-specified model file.
- backends: List of backends to optimize network.
-
- Returns:
- net_id: Unique ID of the network to run.
- runtime: Runtime context for executing inference.
- input_binding_info: Contains essential information about the model input.
- output_binding_info: Used to map output tensor and its memory.
- """
- if not os.path.exists(model_file):
- raise FileNotFoundError(f'Model file not found for: {model_file}')
-
- # Determine which parser to create based on model file extension
- parser = None
- _, ext = os.path.splitext(model_file)
- if ext == '.tflite':
- parser = ann.ITfLiteParser()
- elif ext == '.pb':
- parser = ann.ITfParser()
- elif ext == '.onnx':
- parser = ann.IOnnxParser()
- assert (parser is not None)
- network = parser.CreateNetworkFromBinaryFile(model_file)
-
- # Specify backends to optimize network
- preferred_backends = []
- for b in backends:
- preferred_backends.append(ann.BackendId(b))
-
- # Select appropriate device context and optimize the network for that device
- options = ann.CreationOptions()
- runtime = ann.IRuntime(options)
- opt_network, messages = ann.Optimize(network, preferred_backends, runtime.GetDeviceSpec(),
- ann.OptimizerOptions())
- print(f'Preferred backends: {backends}\n{runtime.GetDeviceSpec()}\n'
- f'Optimization warnings: {messages}')
-
- # Load the optimized network onto the Runtime device
- net_id, _ = runtime.LoadNetwork(opt_network)
-
- # Get input and output binding information
- graph_id = parser.GetSubgraphCount() - 1
- input_names = parser.GetSubgraphInputTensorNames(graph_id)
- input_binding_info = parser.GetNetworkInputBindingInfo(graph_id, input_names[0])
- output_names = parser.GetSubgraphOutputTensorNames(graph_id)
- output_binding_info = []
- for output_name in output_names:
- outBindInfo = parser.GetNetworkOutputBindingInfo(graph_id, output_name)
- output_binding_info.append(outBindInfo)
- return net_id, runtime, input_binding_info, output_binding_info
-
-
-def dict_labels(labels_file: str):
- """
- Creates a labels dictionary from the input labels file.
-
- Args:
- labels_file: Default or user-specified file containing the model output labels.
-
- Returns:
- A dictionary keyed on the classification index with values corresponding to
- labels and randomly generated RGB colors.
- """
- labels_dict = {}
- with open(labels_file, 'r') as labels:
- for index, line in enumerate(labels, 0):
- labels_dict[index] = line.strip('\n'), tuple(np.random.random(size=3) * 255)
- return labels_dict
-
-
-def resize_with_aspect_ratio(frame: np.ndarray, input_binding_info: tuple):
- """
- Resizes frame while maintaining aspect ratio, padding any empty space.
-
- Args:
- frame: Captured frame.
- input_binding_info: Contains shape of model input layer.
-
- Returns:
- Frame resized to the size of model input layer.
- """
- aspect_ratio = frame.shape[1] / frame.shape[0]
- model_height, model_width = list(input_binding_info[1].GetShape())[1:3]
-
- if aspect_ratio >= 1.0:
- new_height, new_width = int(model_width / aspect_ratio), model_width
- b_padding, r_padding = model_height - new_height, 0
- else:
- new_height, new_width = model_height, int(model_height * aspect_ratio)
- b_padding, r_padding = 0, model_width - new_width
-
- # Resize and pad any empty space
- frame = cv2.resize(frame, (new_width, new_height), interpolation=cv2.INTER_LINEAR)
- frame = cv2.copyMakeBorder(frame, top=0, bottom=b_padding, left=0, right=r_padding,
- borderType=cv2.BORDER_CONSTANT, value=[0, 0, 0])
- return frame
-
-
-def preprocess(frame: np.ndarray, input_binding_info: tuple):
- """
- Takes a frame, resizes, swaps channels and converts data type to match
- model input layer. The converted frame is wrapped in a const tensor
- and bound to the input tensor.
-
- Args:
- frame: Captured frame from video.
- input_binding_info: Contains shape and data type of model input layer.
-
- Returns:
- Input tensor.
- """
- # Swap channels and resize frame to model resolution
- frame = cv2.cvtColor(frame, cv2.COLOR_BGR2RGB)
- resized_frame = resize_with_aspect_ratio(frame, input_binding_info)
-
- # Expand dimensions and convert data type to match model input
- data_type = np.float32 if input_binding_info[1].GetDataType() == ann.DataType_Float32 else np.uint8
- resized_frame = np.expand_dims(np.asarray(resized_frame, dtype=data_type), axis=0)
- assert resized_frame.shape == tuple(input_binding_info[1].GetShape())
-
- input_tensors = ann.make_input_tensors([input_binding_info], [resized_frame])
- return input_tensors
-
-
-def execute_network(input_tensors: list, output_tensors: list, runtime, net_id: int) -> np.ndarray:
- """
- Executes inference for the loaded network.
-
- Args:
- input_tensors: The input frame tensor.
- output_tensors: The output tensor from output node.
- runtime: Runtime context for executing inference.
- net_id: Unique ID of the network to run.
-
- Returns:
- Inference results as a list of ndarrays.
- """
- runtime.EnqueueWorkload(net_id, input_tensors, output_tensors)
- output = ann.workload_tensors_to_ndarray(output_tensors)
- return output
-
-
-def draw_bounding_boxes(frame: np.ndarray, detections: list, resize_factor, labels: dict):
- """
- Draws bounding boxes around detected objects and adds a label and confidence score.
-
- Args:
- frame: The original captured frame from video source.
- detections: A list of detected objects in the form [class, [box positions], confidence].
- resize_factor: Resizing factor to scale box coordinates to output frame size.
- labels: Dictionary of labels and colors keyed on the classification index.
- """
- for detection in detections:
- class_idx, box, confidence = [d for d in detection]
- label, color = labels[class_idx][0].capitalize(), labels[class_idx][1]
-
- # Obtain frame size and resized bounding box positions
- frame_height, frame_width = frame.shape[:2]
- x_min, y_min, x_max, y_max = [int(position * resize_factor) for position in box]
-
- # Ensure box stays within the frame
- x_min, y_min = max(0, x_min), max(0, y_min)
- x_max, y_max = min(frame_width, x_max), min(frame_height, y_max)
-
- # Draw bounding box around detected object
- cv2.rectangle(frame, (x_min, y_min), (x_max, y_max), color, 2)
-
- # Create label for detected object class
- label = f'{label} {confidence * 100:.1f}%'
- label_color = (0, 0, 0) if sum(color)>200 else (255, 255, 255)
-
- # Make sure label always stays on-screen
- x_text, y_text = cv2.getTextSize(label, cv2.FONT_HERSHEY_DUPLEX, 1, 1)[0][:2]
-
- lbl_box_xy_min = (x_min, y_min if y_min<25 else y_min - y_text)
- lbl_box_xy_max = (x_min + int(0.55 * x_text), y_min + y_text if y_min<25 else y_min)
- lbl_text_pos = (x_min + 5, y_min + 16 if y_min<25 else y_min - 5)
-
- # Add label and confidence value
- cv2.rectangle(frame, lbl_box_xy_min, lbl_box_xy_max, color, -1)
- cv2.putText(frame, label, lbl_text_pos, cv2.FONT_HERSHEY_DUPLEX, 0.50,
- label_color, 1, cv2.LINE_AA)
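
Although utils.py is deleted here, its letterbox resize is the behaviour preprocess() depends on (an equivalent helper is presumably provided by the shared cv_utils module the scripts now import). A worked example of the aspect-ratio maths from the removed resize_with_aspect_ratio, using illustrative sizes:

    # Worked example of the letterbox calculation in the removed resize_with_aspect_ratio;
    # the 720p frame and 300x300 model input below are illustrative values only.
    frame_h, frame_w = 720, 1280
    model_h, model_w = 300, 300
    aspect_ratio = frame_w / frame_h       # 1280 / 720 = 1.78 (landscape)

    # aspect_ratio >= 1.0: keep the model width, shrink the height, pad the bottom.
    new_w = model_w                        # 300
    new_h = int(model_w / aspect_ratio)    # int(300 / 1.78) = 168
    bottom_pad = model_h - new_h           # 300 - 168 = 132 rows of black padding
    right_pad = 0

After resizing and padding, the frame matches the model's input shape exactly, which is what the assert in the removed preprocess() verified before wrapping the frame in an input tensor.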