aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--OPTIONS.md12
-rw-r--r--ethosu/vela/architecture_features.py2
-rw-r--r--ethosu/vela/compiler_driver.py4
-rw-r--r--ethosu/vela/scheduler.py11
-rw-r--r--ethosu/vela/tensor.py13
-rw-r--r--ethosu/vela/test/testutil.py1
-rw-r--r--ethosu/vela/vela.py8
7 files changed, 42 insertions, 9 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index fa060bbe..fa5f4136 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -244,6 +244,18 @@ out in 1x1x16B bricks in row-major order. This enables more efficient FeatureMap
vela network.tflite --nhcwb16-between-cascaded-passes
```
+### Scaling of weight estimates
+
+Performs an additional scaling of weight compression estimate used by Vela to estimate SRAM usage.
+Increasing this scaling factor will make the estimates more conservative (i.e. assume a lower weight
+compression ratio, and therefore a larger weight storage size) and this can result
+in optimisations that use less SRAM, albeit at the cost of performance (inference speed).
+**Type: Float**
+**Default: 1.0**
+
+```bash
+vela network.tflite --weight-estimation-scaling=1.2
+```
+
## Verbose Print Options
All of the options below are disabled by default and enabling them will add
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 5453f2cf..8b968a3e 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -184,6 +184,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
global_memory_clock_scale,
max_blockdep,
softmax_support,
+ weight_estimation_scaling,
):
accelerator_config = accelerator_config.lower()
self.vela_config = vela_config
@@ -215,6 +216,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
)
self.max_blockdep = max_blockdep
+ self.weight_estimation_scaling = weight_estimation_scaling
dpu_min_height = accel_config.ofm_ublock.height
dpu_min_width = accel_config.ofm_ublock.width
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 5e9e38fb..94900ad5 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -212,7 +212,9 @@ def compiler_driver(nng, arch, options, scheduler_options):
if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
raise VelaError(
- "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes".format(
+ "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. "
+ "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
+ "See OPTIONS.md for more information.".format(
arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
)
)
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index f3b3a79c..9a8215d5 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -608,7 +608,10 @@ class DynamicProgrammingScheduler:
base_sram_used = 0
for tens in ps.intermediates:
if tens.mem_area == self.mem_area:
- base_sram_used += tens.storage_size()
+ if tens.purpose == TensorPurpose.Weights:
+ base_sram_used += tens.storage_size(self.arch.weight_estimation_scaling)
+ else:
+ base_sram_used += tens.storage_size()
all_block_configs = self.get_block_configs(ps)
for block_config in all_block_configs:
@@ -718,7 +721,7 @@ class DynamicProgrammingScheduler:
)
]
sram_used += ifm_tensor.storage_size_for_sub_purpose(
- TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
+ self.arch, TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
)
all_candidates.extend(self.append_sram_rewrite_list(sram_used, rewrite_list, [strat_opt]))
@@ -779,7 +782,9 @@ class DynamicProgrammingScheduler:
for tens in ps.intermediates:
if tens.mem_area == self.mem_area:
if tens.purpose == TensorPurpose.Weights:
- sram_used += tens.storage_size_for_sub_purpose(TensorSubPurpose.DoubleBuffer, block_config[3])
+ sram_used += tens.storage_size_for_sub_purpose(
+ self.arch, TensorSubPurpose.DoubleBuffer, block_config[3]
+ )
rewrite_list.append(
(
SchedulerRewrite.ChangeTensorSubPurpose,
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index d4f6a409..3ad9b253 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -439,20 +439,25 @@ class Tensor:
def has_fully_defined_shape(self):
return shape_fully_defined(self.shape)
- def storage_size(self):
- raw_size = self.storage_elements() * self.element_size()
+ def storage_size(self, scale=1.0):
+ raw_size = self.storage_elements() * self.element_size() * scale
if raw_size == 0:
raw_size = 1 # force it to take up space
rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
return rounded_size
- def storage_size_for_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+ def storage_size_for_sub_purpose(self, arch, sub_purpose, param_a=None, param_b=None):
alt_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
elems = shape_num_elements(alt_shape)
if elems is None:
return 0
if sub_purpose == TensorSubPurpose.DoubleBuffer:
- raw_size = elems * self.element_size() * self.compression_scale_for_worst_weight_stream
+ raw_size = (
+ elems
+ * self.element_size()
+ * self.compression_scale_for_worst_weight_stream
+ * arch.weight_estimation_scaling
+ )
else:
# Rolling buffers are used for intermediate data in ifm streaming
# These will all use the NHCWB16 format, and need to be aligned to 16 in the C-dimension
diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py
index 116afa40..68866fc7 100644
--- a/ethosu/vela/test/testutil.py
+++ b/ethosu/vela/test/testutil.py
@@ -38,6 +38,7 @@ def create_arch():
global_memory_clock_scale=1.0,
max_blockdep=0,
softmax_support=True,
+ weight_estimation_scaling=1.0,
)
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 97cc8736..19080926 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -260,7 +260,12 @@ def main(args=None):
choices=[True, False],
help="Control if Softmax should be transformed into a set of npu operations (default: %(default)s)",
)
-
+ parser.add_argument(
+ "--weight-estimation-scaling",
+ type=float,
+ default=1.0,
+ help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
+ )
args = parser.parse_args(args=args)
# Read configuration file
@@ -291,6 +296,7 @@ def main(args=None):
global_memory_clock_scale=args.global_memory_clock_scale,
max_blockdep=args.max_block_dependency,
softmax_support=args.softmax_support,
+ weight_estimation_scaling=args.weight_estimation_scaling,
)
compiler_options = compiler_driver.CompilerOptions(