author    Jacob Bohlin <jacob.bohlin@arm.com>  2020-06-23 12:12:56 +0200
committer Jacob Bohlin <jacob.bohlin@arm.com>  2020-07-07 09:13:42 +0200
commit    e843d3311b8945baa32654af0dccb229b6861438 (patch)
tree      335db03df5745da1ed68b62e24424f7ae3a32294 /ethosu/vela
parent    42e4189689a7ded7e2a804f6263a7c588fbb66cd (diff)
download  ethos-u-vela-e843d3311b8945baa32654af0dccb229b6861438.tar.gz
MLBEDSW-2548: Fix for Double Buffer size estimate
This will give a worst-case estimate of the Double Buffer size in the Scheduler, so it will no longer be able to choose strategies that end up with a buffer that doesn't fit in SRAM.

Signed-off-by: Jacob Bohlin <jacob.bohlin@arm.com>
Change-Id: I763731f63c7672679f3b8cd6db65dad03b946ae5
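For illustration only (not part of the commit): sizing the double buffer from the full, uncompressed weight shape rather than a per-strategy compressed shape yields an upper bound that holds for any scheduling strategy. A minimal Python sketch of that bound, with hypothetical names (weight_shape, ofm_depth_step, element_size):

# Hedged sketch, not the Vela API: upper bound on the weight double buffer.
def worst_case_double_buffer_size(weight_shape, ofm_depth_step, element_size=1):
    shp = list(weight_shape)                     # full, uncompressed shape
    assert len(shp) >= 2
    shp[-1] = min(shp[-1], ofm_depth_step * 2)   # two halves of the double buffer
    size = element_size
    for dim in shp:
        size *= dim
    return size

# Example: 3x3x64 weights with 512 output channels, depth step 16
print(worst_case_double_buffer_size([3, 3, 64, 512], 16))  # 3*3*64*32 = 18432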
Diffstat (limited to 'ethosu/vela')
-rw-r--r--  ethosu/vela/graph_optimiser.py    13
-rw-r--r--  ethosu/vela/mark_tensors.py        2
-rw-r--r--  ethosu/vela/tensor.py             37
-rw-r--r--  ethosu/vela/weight_compressor.py  55
4 files changed, 64 insertions(+), 43 deletions(-)
diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py
index cd4ac63e..dbf2b7b9 100644
--- a/ethosu/vela/graph_optimiser.py
+++ b/ethosu/vela/graph_optimiser.py
@@ -435,6 +435,18 @@ def convert_depthwise_to_conv(op, arch):
return op
+def reorder_depthwise_weights(op, arch):
+ if "DepthwiseConv2d" in op.type:
+ weight_tensor = op.inputs[1]
+ weight_tensor.quant_values = np.transpose(weight_tensor.quant_values, (0, 1, 3, 2))
+ weight_tensor.shape = weight_tensor.storage_shape = weight_tensor.bandwidth_shape = list(
+ weight_tensor.quant_values.shape
+ )
+ weight_tensor.weight_transpose_depthwise = True
+
+ return op
+
+
# Reorder activation op if it's after the memory-only operations
def fixup_act_reorder(op, arch):
if op.type in activation_ops:
@@ -589,6 +601,7 @@ def optimise_graph_a(nng, arch, verbose_graph=False):
add_padding_fields,
mark_npu_block_type,
fixup_elementwise_with_scalars,
+ reorder_depthwise_weights,
# convert_mul_max_to_abs_or_lrelu # TODO: enable optimisation once quantisation issues are resolved
]
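For illustration (not part of the diff): the new pass swaps the last two axes of the 4-D depthwise weight tensor. A standalone numpy sketch, with an assumed illustrative layout of (kh, kw, channels, depth_multiplier):

import numpy as np

# Hedged sketch of the axis swap done by reorder_depthwise_weights above.
w = np.arange(3 * 3 * 8 * 2).reshape(3, 3, 8, 2)  # assumed (kh, kw, c, m) layout
w_t = np.transpose(w, (0, 1, 3, 2))               # swap the last two axes
print(w.shape, "->", w_t.shape)                   # (3, 3, 8, 2) -> (3, 3, 2, 8)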
diff --git a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py
index 705f839b..5f3a13fa 100644
--- a/ethosu/vela/mark_tensors.py
+++ b/ethosu/vela/mark_tensors.py
@@ -368,7 +368,7 @@ def mark_tensor_format(nng, arch, verbose_tensor_format=False):
if src_tens is not None:
op = tens.find_npu_op()
npu_block_type = op.attrs["npu_block_type"]
- weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 32, 32, op.get_dilation_h_w())
+ weight_compressor.compress_weights(arch, nng, tens, npu_block_type, 16, 16, op.get_dilation_h_w())
# Alias compressed weights back into source tensor
src_tens.copy_compressed_weight_info(tens)
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index bc0597f6..5e97cfe8 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -414,27 +414,30 @@ class Tensor:
return rounded_size
def storage_shape_for_sub_purpose(self, sub_purpose, param_a, param_b):
- shp = list(self.storage_shape)
if sub_purpose == TensorSubPurpose.DoubleBuffer:
+ shp = list(self.shape)
assert len(shp) >= 2
shp[-1] = min(shp[-1], param_a * 2)
- elif sub_purpose == TensorSubPurpose.RollingBufferX:
- assert len(shp) == 4
- shp[0] = 1
- shp[2] = min(shp[2], param_a)
- elif sub_purpose == TensorSubPurpose.RollingBufferY:
- assert len(shp) == 4
- shp[0] = 1
- shp[1] = min(shp[1], param_a)
- elif sub_purpose == TensorSubPurpose.RollingBufferXY:
- assert len(shp) == 4
- shp[0] = 1
- shp[2] = min(shp[2], param_a)
- shp[1] = min(shp[1], param_b)
- elif sub_purpose == TensorSubPurpose.Standard:
- pass
else:
- assert 0, "did not expect new sub purpose %s" % (sub_purpose,)
+ shp = list(self.storage_shape)
+ if sub_purpose == TensorSubPurpose.RollingBufferX:
+ assert len(shp) == 4
+ shp[0] = 1
+ shp[2] = min(shp[2], param_a)
+ elif sub_purpose == TensorSubPurpose.RollingBufferY:
+ assert len(shp) == 4
+ shp[0] = 1
+ shp[1] = min(shp[1], param_a)
+ elif sub_purpose == TensorSubPurpose.RollingBufferXY:
+ assert len(shp) == 4
+ shp[0] = 1
+ shp[2] = min(shp[2], param_a)
+ shp[1] = min(shp[1], param_b)
+ elif sub_purpose == TensorSubPurpose.Standard:
+ pass
+ else:
+ assert 0, "did not expect new sub purpose %s" % (sub_purpose,)
+
return shp
def set_new_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
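Again for illustration only: the restructured branch above derives the DoubleBuffer shape from tens.shape (the full shape) instead of tens.storage_shape, clamping the depth axis to two depth-steps' worth of weights. A plain-list sketch of that branch:

# Hedged sketch of the DoubleBuffer branch, using a plain list for tens.shape.
def double_buffer_shape(shape, param_a):
    shp = list(shape)                    # worst case: full shape, not storage_shape
    assert len(shp) >= 2
    shp[-1] = min(shp[-1], param_a * 2)
    return shp

print(double_buffer_shape([3, 3, 1, 512], 16))  # [3, 3, 1, 32]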
diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py
index d3562891..2554b7c8 100644
--- a/ethosu/vela/weight_compressor.py
+++ b/ethosu/vela/weight_compressor.py
@@ -19,7 +19,6 @@ import math
from collections import namedtuple
import numpy as np
-from ethosu import mlw_codec
from .data_type import DataType
from .errors import UnsupportedFeatureError
@@ -32,6 +31,7 @@ from .tensor import TensorBlockTraversal
from .tensor import TensorFormat
from .tensor import TensorPurpose
from .tensor import TensorSubPurpose
+from ethosu import mlw_codec
# Contains meta info for a weight compression. If two tensors have identical weight compression config,
@@ -177,10 +177,12 @@ def generate_brick(arch, brick_weights, ofm_block_depth, block_traversal, ifm_bi
stream.append(brick_weights[ofm_z][wy][wx][ifm_z])
return stream
+
def core_deinterleave(hwio, core, ncores):
# Put weights back into OHWI
- ohwi = np.transpose(hwio, (3,0,1,2))
- return ohwi[core:ohwi.shape[0]:ncores]
+ ohwi = np.transpose(hwio, (3, 0, 1, 2))
+ return ohwi[core : ohwi.shape[0] : ncores]
+
# Compress the weights
def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth_step, dilation):
@@ -244,6 +246,10 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
# Transpose Convolution, reverse weights in H and W axes
weights = np.flip(weights, axis=(0, 1))
+ # Calculate brick size
+ brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
+ elements_in_brick = np.prod(brick_size)
+
# Slice weight stream up depth-ways into bricks and compress
full_ofm_depth = quant_buf.shape[-1]
for idx in range(0, full_ofm_depth, ofm_depth_step):
@@ -262,17 +268,19 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
block_depth = (ofm_block_depth + arch.ncores - 1 - core) // arch.ncores
if block_depth != 0:
- raw_stream = generate_brick(arch, core_weights, block_depth, tens.block_traversal, ifm_bitdepth, dilation)
+ raw_stream = generate_brick(
+ arch, core_weights, block_depth, tens.block_traversal, ifm_bitdepth, dilation
+ )
else:
raw_stream = []
- raw_size += len( raw_stream )
- encoded_substream = encode( raw_stream )
- encoded_stream.extend( encoded_substream )
- substream_offsets.append( len(encoded_stream) )
+ raw_size += len(raw_stream)
+ encoded_substream = encode(raw_stream)
+ encoded_stream.extend(encoded_substream)
+ substream_offsets.append(len(encoded_stream))
- encoded_streams.append( encoded_stream )
- encoded_streams_substream_offsets.append( substream_offsets )
+ encoded_streams.append(encoded_stream)
+ encoded_streams_substream_offsets.append(substream_offsets)
# Remember maximum encoded length for DoubleBuffering
max_single_buffer_len = max(max_single_buffer_len, len(encoded_stream))
@@ -283,7 +291,7 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
assert offset % 16 == 0
# Compression scale tracking
- compression_scales.append(len(encoded_stream) / raw_size)
+ compression_scales.append(len(encoded_stream) / elements_in_brick)
# Track total length as last element of the offsets array
compressed_offsets.append(offset)
@@ -294,10 +302,11 @@ def compress_weights(arch, nng, tens, npu_block_type, ofm_block_depth, ofm_depth
tens.storage_compression_scale = tens.bandwidth_compression_scale = np.average(compression_scales)
tens.compressed_values = encoded_streams
tens.compressed_values_substream_offsets = encoded_streams_substream_offsets
- tens.brick_size = (weights_shape[0], weights_shape[1], weights_shape[2], min(tens.shape[-1], ofm_depth_step))
+ tens.brick_size = brick_size
set_storage_shape(tens)
nng.weight_cache.add(tens)
+
def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=False):
assert tens.purpose == TensorPurpose.FeatureMap
assert tens.format == TensorFormat.NHWC
@@ -399,24 +408,25 @@ def calc_scales_and_pack_biases(tens, arch, ofm_depth_step, rescale_for_faf=Fals
substream_offsets = [0]
max_len = min(ofm_depth_step, total_elements - i)
for core in range(0, min(arch.ncores, max_len)):
- core_scales = quantised_scales[i+core:i+core+max_len:arch.ncores]
- core_biases = biases[i+core:i+core+max_len:arch.ncores]
+ core_scales = quantised_scales[i + core : i + core + max_len : arch.ncores]
+ core_biases = biases[i + core : i + core + max_len : arch.ncores]
for j, core_bias in enumerate(core_biases):
- stream.extend( pack_bias_and_scale(core_bias, *core_scales[j]) )
+ stream.extend(pack_bias_and_scale(core_bias, *core_scales[j]))
# Align to 16 for start of next substream
- remainder = ( len(stream) ) % 16
+ remainder = (len(stream)) % 16
if remainder > 0:
- stream.extend( bytearray(16 - remainder) )
+ stream.extend(bytearray(16 - remainder))
- substream_offsets.append( len(stream) )
+ substream_offsets.append(len(stream))
# Add to compressed values with their substream offset lists to the tensor
- tens.compressed_values.append( stream )
- tens.compressed_values_substream_offsets.append( substream_offsets )
+ tens.compressed_values.append(stream)
+ tens.compressed_values_substream_offsets.append(substream_offsets)
tens.storage_shape = [total_elements * tens.element_size_bytes]
+
def update_pass_weight_and_scale_tensors(nng, arch):
for sg in nng.subgraphs:
for ps in sg.passes:
@@ -424,11 +434,6 @@ def update_pass_weight_and_scale_tensors(nng, arch):
if tens is not None:
op = tens.find_npu_op()
npu_usage_of_tensor = op.attrs["npu_block_type"]
- if npu_usage_of_tensor == NpuBlockType.ConvolutionDepthWise:
- tens.quant_values = np.transpose(tens.quant_values, (0, 1, 3, 2))
- tens.shape = tens.storage_shape = tens.bandwidth_shape = list(tens.quant_values.shape)
- tens.weight_transpose_depthwise = True
-
needs_dma = tens.needs_dma()
if ps.cascade.strategy == SchedulingStrategy.WeightStream and needs_dma:
ofm_depth_step = ps.block_config[-1]
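One last illustrative sketch (not part of the commit): with the change above, the per-brick compression scale divides the encoded length by the brick's nominal element count rather than by the padded raw stream length, so it measures encoded bytes against uncompressed elements. The values below are made up:

import numpy as np

# Hedged sketch of the new compression-scale bookkeeping.
brick_size = (3, 3, 64, 16)                  # (kh, kw, ifm_depth, ofm_depth_step)
elements_in_brick = int(np.prod(brick_size))
encoded_stream = bytearray(2048)             # stand-in for one encoded brick
compression_scale = len(encoded_stream) / elements_in_brick
print(round(compression_scale, 3))           # 0.222 for these made-up numbers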