aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorFredrik Svedberg <fredrik.svedberg@arm.com>2021-04-23 14:36:42 +0200
committerpatrik.gustavsson <patrik.gustavsson@arm.com>2021-05-06 12:52:21 +0000
commitf5c07c45b48717fb6391adb35cb73ca7cd5734c3 (patch)
tree199fc4646a403af01009c8e10aafec0bad1e910e
parenta0b8d5f1bc32092cb85df07fb68e5582d01def32 (diff)
downloadethos-u-vela-f5c07c45b48717fb6391adb35cb73ca7cd5734c3.tar.gz
[MLBEDSW-4254] Improve weight information in summary
Improved the weight information shown in the summary when the --verbose-weights option is used. Signed-off-by: Fredrik Svedberg <fredrik.svedberg@arm.com> Change-Id: Iac142f2a813bf1c05aa9da3f8a384466e2914d06
-rw-r--r--OPTIONS.md8
-rw-r--r--ethosu/vela/api.py5
-rw-r--r--ethosu/vela/compiler_driver.py2
-rw-r--r--ethosu/vela/nn_graph.py4
-rw-r--r--ethosu/vela/stats_writer.py37
-rw-r--r--ethosu/vela/tensor_allocation.py4
-rw-r--r--ethosu/vela/vela.py9
-rw-r--r--ethosu/vela/weight_compressor.py33
8 files changed, 68 insertions, 34 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index 86b05f63..e8207115 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -431,6 +431,14 @@ Verbose operator list.
vela network.tflite --verbose-operators
```
+### Verbose Weights
+
+Verbose weights information.
+
+```bash
+vela network.tflite --verbose-weights
+```
+
## Configuration File
This is used to describe various properties of the Ethos-U embedded system. The
diff --git a/ethosu/vela/api.py b/ethosu/vela/api.py
index f972133d..e91c0bdb 100644
--- a/ethosu/vela/api.py
+++ b/ethosu/vela/api.py
@@ -416,15 +416,16 @@ def npu_encode_weights(
:param ofm_block_depth: the depth of blocks for processing
:param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
:param block_traversal: indicates how these weights are traversed on sub-kernel basis
- :return: a bytearray of compressed weights
+ :return: a bytearray of encoded weights
"""
from .architecture_features import Accelerator
from . import weight_compressor
acc = Accelerator.from_npu_accelerator(accelerator)
- return weight_compressor.encode_weights(
+ encoded_weights, _ = weight_compressor.encode_weights(
acc, weights_volume, dilation_xy, ifm_bitdepth, ofm_block_depth, is_depthwise, block_traversal
)
+ return encoded_weights
def npu_encode_bias(bias: numpy.int64, scale: int, shift: int):
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index a3c01001..26d350ea 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -61,6 +61,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
verbose_high_level_command_stream=False,
verbose_register_command_stream=False,
verbose_operators=False,
+ verbose_weights=False,
show_cpu_operations=False,
tensor_allocator=TensorAllocator.Greedy,
timing=False,
@@ -77,6 +78,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
self.verbose_high_level_command_stream = verbose_high_level_command_stream
self.verbose_register_command_stream = verbose_register_command_stream
self.verbose_operators = verbose_operators
+ self.verbose_weights = verbose_weights
self.show_cpu_operations = show_cpu_operations
self.tensor_allocator = tensor_allocator
self.timing = timing
diff --git a/ethosu/vela/nn_graph.py b/ethosu/vela/nn_graph.py
index 2d4b0c87..677a385a 100644
--- a/ethosu/vela/nn_graph.py
+++ b/ethosu/vela/nn_graph.py
@@ -517,9 +517,9 @@ class Graph:
self.subgraphs = []
self.metadata = []
self.memory_used = {}
- self.weights_compression_ratio = 0
self.total_original_weights = 0
- self.total_compressed_weights = 0
+ self.total_npu_weights = 0
+ self.total_npu_encoded_weights = 0
self.weight_cache = None # See CompressedWeightCache
def get_root_subgraph(self):
diff --git a/ethosu/vela/stats_writer.py b/ethosu/vela/stats_writer.py
index 597fd151..18b8092e 100644
--- a/ethosu/vela/stats_writer.py
+++ b/ethosu/vela/stats_writer.py
@@ -58,7 +58,9 @@ def write_summary_metrics_csv(nng, summary_filename, arch):
"passes_after_fusing",
]
labels += [area.identifier_name() + "_memory_used" for area in mem_areas]
- labels += ["weights_compression_ratio"]
+ labels += ["total_original_weights"]
+ labels += ["total_npu_weights"]
+ labels += ["total_npu_encoded_weights"]
for mem_area in mem_areas:
labels += [
@@ -107,7 +109,9 @@ def write_summary_metrics_csv(nng, summary_filename, arch):
data_items += [midpoint_fps, nng.batch_size, midpoint_inference_time, n_passes, n_cascaded_passes]
data_items += [nng.memory_used.get(mem_area, 0) / 1024.0 for mem_area in mem_areas]
- data_items += [nng.weights_compression_ratio]
+ data_items += [nng.total_original_weights]
+ data_items += [nng.total_npu_weights]
+ data_items += [nng.total_npu_encoded_weights]
for mem_area in mem_areas:
bws = nng.bandwidths[mem_area]
@@ -228,8 +232,8 @@ def print_performance_metrics_for_strat(
num_cascaded_passes,
n_operations=0,
cpu_operations=None,
- weights_compression_ratio=None,
show_cpu_operations=False,
+ weights_data=None,
f=sys.stdout,
):
@@ -327,10 +331,11 @@ def print_performance_metrics_for_strat(
)
print(file=f)
- if weights_compression_ratio:
- print(
- f"Weights Compression Ratio {weights_compression_ratio:12.2f}", file=f,
- )
+ if weights_data:
+ print(f"Original Weights Size {weights_data['original'] / 1024.0:12.2f} KiB", file=f)
+ print(f"NPU Weights Size {weights_data['npu'] / 1024.0:12.2f} KiB", file=f)
+ print(f"NPU Encoded Weights Size {weights_data['npu_encoded'] / 1024.0:12.2f} KiB", file=f)
+ print(file=f)
print(
f"Neural network macs {int(macs):12d} MACs/batch", file=f,
@@ -354,12 +359,21 @@ def print_performance_metrics_for_strat(
print(file=f)
-def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout):
+def print_performance_metrics(nng, arch, show_cpu_operations=False, verbose_weights=False, f=sys.stdout):
n_passes = sum(len(sg.passes) for sg in nng.subgraphs)
n_cascaded_passes = sum(len(sg.cascaded_passes) for sg in nng.subgraphs)
n_operations = sum(len(ps.ops) for sg in nng.subgraphs for ps in sg.passes)
cpu_operations = sum((ps.ops for sg in nng.subgraphs for ps in sg.passes if ps.placement == PassPlacement.Cpu), [])
min_mem_usage = max(sg.min_mem_usage for sg in nng.subgraphs)
+ weights_data = (
+ {
+ "original": nng.total_original_weights,
+ "npu": nng.total_npu_weights,
+ "npu_encoded": nng.total_npu_encoded_weights,
+ }
+ if verbose_weights
+ else None
+ )
return print_performance_metrics_for_strat(
arch,
nng.name,
@@ -373,12 +387,7 @@ def print_performance_metrics(nng, arch, show_cpu_operations=False, f=sys.stdout
n_cascaded_passes,
n_operations,
cpu_operations,
- nng.weights_compression_ratio,
show_cpu_operations,
+ weights_data,
f,
)
-
-
-def write_human_friendly_metrics(nng, arch, filename):
- f = open(filename, "w")
- print_performance_metrics(nng, arch, f=f)
diff --git a/ethosu/vela/tensor_allocation.py b/ethosu/vela/tensor_allocation.py
index 0ad30e5f..7ffc6f3d 100644
--- a/ethosu/vela/tensor_allocation.py
+++ b/ethosu/vela/tensor_allocation.py
@@ -217,9 +217,5 @@ def allocate_tensors(
if sg == nng.get_root_subgraph():
nng.memory_used = sg.memory_used
- try:
- nng.weights_compression_ratio = nng.total_compressed_weights / nng.total_original_weights
- except ZeroDivisionError:
- nng.weights_compression_ratio = 0.0
return True
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index c9551861..aa74ecf3 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -77,7 +77,12 @@ def process(input_name, enable_debug_db, arch, model_reader_options, compiler_op
summary_csv_file = "{0}_summary_{1}.csv".format(output_basename, arch.system_config)
stats_writer.write_summary_metrics_csv(nng, summary_csv_file, arch)
- stats_writer.print_performance_metrics(nng, show_cpu_operations=compiler_options.show_cpu_operations, arch=arch)
+ stats_writer.print_performance_metrics(
+ nng,
+ show_cpu_operations=compiler_options.show_cpu_operations,
+ verbose_weights=compiler_options.verbose_weights,
+ arch=arch,
+ )
output_filename = output_basename + "_vela.tflite"
if input_name.endswith(".tflite"):
@@ -284,6 +289,7 @@ def main(args=None):
"--verbose-register-command-stream", action="store_true", help="Verbose register command stream"
)
parser.add_argument("--verbose-operators", action="store_true", help="Verbose operator list")
+ parser.add_argument("--verbose-weights", action="store_true", help="Verbose weights information")
parser.add_argument(
"--show-cpu-operations", action="store_true", help="Show the operations that fall back to the CPU"
)
@@ -456,6 +462,7 @@ def main(args=None):
verbose_high_level_command_stream=args.verbose_high_level_command_stream,
verbose_register_command_stream=args.verbose_register_command_stream,
verbose_operators=args.verbose_operators,
+ verbose_weights=args.verbose_weights,
show_cpu_operations=args.show_cpu_operations,
tensor_allocator=args.tensor_allocator,
timing=args.timing,
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index bb7cd674..7ce237ca 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -68,7 +68,7 @@ def encode_weights(
:param is_depthwise: a boolean indicating these weights are used for a depthwise traversal
:param block_traversal: indicates how these weights are traversed on sub-kernel basis
- :return: a bytearray of compressed weights
+ :return: a tuple with a bytearray of encoded weights and the size of the unencoded weights
"""
# Check arg types
assert isinstance(accelerator, Accelerator)
@@ -104,7 +104,7 @@ def encode_weights(
dilation=dilation_xy,
)
encoded_stream = encode(raw_stream)
- return encoded_stream
+ return encoded_stream, len(raw_stream)
def encode_bias(bias: np.int64, scale: int, shift: int):
@@ -161,15 +161,23 @@ class CompressedWeightCache:
def __init__(self):
self.cache = {} # maps from WeightCompressionConfig to a tensor clone containing compressed weights
+ def has_tensor_with_same_compression(self, wcc):
+ return self.cache.get(wcc) is not None
+
def get_tensor_with_same_compression(self, wcc):
- return self.cache.get(wcc)
+ cache_obj = self.cache.get(wcc)
+ return cache_obj[0] if cache_obj else None
+
+ def get_unencoded_size_with_same_compression(self, wcc):
+ cache_obj = self.cache.get(wcc)
+ return cache_obj[1] if cache_obj else None
- def add(self, tens):
+ def add(self, tens, unencoded_size):
# Adds the compressed weights from the tensor to the cache
wcc = tens.weight_compression_config
# Clone the tensor to make sure that nothing related to the weight compression is modified
tens_clone = tens.clone("_weights{}_{}".format(wcc.ofm_block_depth, wcc.ofm_depth_step))
- self.cache[wcc] = tens_clone
+ self.cache[wcc] = (tens_clone, unencoded_size)
def encode(weight_stream):
@@ -300,7 +308,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
# Cache hit, copy weights from the cache
tens.copy_compressed_weight_info(tens_cached)
set_storage_shape(tens)
- return
+ return nng.weight_cache.get_unencoded_size_with_same_compression(wcc)
# No cache hit, perform the compression
assert tens.quantization is not None
assert tens.quantization.scale_f32 is not None
@@ -321,6 +329,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
encoded_streams_substream_offsets = []
offset = 0
max_single_buffer_len = 0
+ unencoded_size = 0
ifm_bitdepth = tens.consumer_list[0].inputs[0].dtype.size_in_bits()
ifm_depth = weights.shape[-2]
@@ -371,7 +380,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
block_depth = (ofm_block_depth + arch.ncores - 1 - core) // arch.ncores
encoded_substream = []
if block_depth != 0:
- encoded_substream = encode_weights(
+ encoded_substream, raw_stream_size = encode_weights(
accelerator=arch.accelerator_config,
weights_volume=core_weights,
dilation_xy=dilation,
@@ -380,6 +389,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
is_depthwise=is_depthwise,
block_traversal=block_traversal,
)
+ unencoded_size += raw_stream_size
encoded_stream.extend(encoded_substream)
substream_offsets.append(len(encoded_stream))
@@ -408,7 +418,8 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
tens.compressed_values_substream_offsets = encoded_streams_substream_offsets
tens.brick_size = brick_size
set_storage_shape(tens)
- nng.weight_cache.add(tens)
+ nng.weight_cache.add(tens, unencoded_size)
+ return unencoded_size
def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
@@ -525,11 +536,11 @@ def update_pass_weight_and_scale_tensors(nng, arch):
ofm_depth_step = ps.block_config[-1]
else:
ofm_depth_step = tens.shape[-1]
- compress_weights(
+ nng.total_npu_weights += compress_weights(
arch, nng, tens, op.type.npu_block_type, ps.block_config[-1], ofm_depth_step, op.get_dilation_h_w()
)
- nng.total_compressed_weights += tens.weight_compressed_offsets[-1]
- nng.total_original_weights += tens.elements() * tens.element_size()
+ nng.total_npu_encoded_weights += tens.weight_compressed_offsets[-1]
+ nng.total_original_weights += int(tens.elements() * tens.element_size())
# Update source tensor
if needs_dma: