aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--OPTIONS.md12
-rw-r--r--ethosu/vela/architecture_features.py2
-rw-r--r--ethosu/vela/compiler_driver.py4
-rw-r--r--ethosu/vela/scheduler.py11
-rw-r--r--ethosu/vela/tensor.py13
-rw-r--r--ethosu/vela/test/testutil.py1
-rw-r--r--ethosu/vela/vela.py8
7 files changed, 42 insertions, 9 deletions
diff --git a/OPTIONS.md b/OPTIONS.md
index fa060bbe..fa5f4136 100644
--- a/OPTIONS.md
+++ b/OPTIONS.md
@@ -244,6 +244,18 @@ out in 1x1x16B bricks in row-major order. This enables more efficient FeatureMap
vela network.tflite --nhcwb16-between-cascaded-passes
```
+### Scaling of weight estimates
+
+Performs an additional scaling of weight compression estimate used by Vela to estimate SRAM usage.
+Increasing this scaling factor will make the estimates more conservative (i.e. assume a lower weight
+compression ratio, and therefore a larger weight storage size) and this can result
+in optimisations that use less SRAM, albeit at the cost of performance (inference speed).
+**Type: Float**
+**Default: 1.0**
+
+```bash
+vela network.tflite --weight-estimation-scaling=1.2
+```
+
## Verbose Print Options
All of the options below are disabled by default and enabling them will add
diff --git a/ethosu/vela/architecture_features.py b/ethosu/vela/architecture_features.py
index 5453f2cf..8b968a3e 100644
--- a/ethosu/vela/architecture_features.py
+++ b/ethosu/vela/architecture_features.py
@@ -184,6 +184,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
global_memory_clock_scale,
max_blockdep,
softmax_support,
+ weight_estimation_scaling,
):
accelerator_config = accelerator_config.lower()
self.vela_config = vela_config
@@ -215,6 +216,7 @@ Note the difference between ArchitectureFeatures and CompilerOptions
)
self.max_blockdep = max_blockdep
+ self.weight_estimation_scaling = weight_estimation_scaling
dpu_min_height = accel_config.ofm_ublock.height
dpu_min_width = accel_config.ofm_ublock.width
diff --git a/ethosu/vela/compiler_driver.py b/ethosu/vela/compiler_driver.py
index 5e9e38fb..94900ad5 100644
--- a/ethosu/vela/compiler_driver.py
+++ b/ethosu/vela/compiler_driver.py
@@ -212,7 +212,9 @@ def compiler_driver(nng, arch, options, scheduler_options):
if root_sg is not None and (arch.feature_map_storage_mem_area != arch.fast_storage_mem_area):
if root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0) > arch.sram_size:
raise VelaError(
- "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes".format(
+ "Sram limit {} bytes, has been exceeded by the scratch fast tensor {} bytes. "
+ "Increasing the value of --weight-estimation-scaling may help to resolve the issue. "
+ "See OPTIONS.md for more information.".format(
arch.sram_size, root_sg.memory_used_per_type.get(MemType.Scratch_fast, 0)
)
)
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py
index f3b3a79c..9a8215d5 100644
--- a/ethosu/vela/scheduler.py
+++ b/ethosu/vela/scheduler.py
@@ -608,7 +608,10 @@ class DynamicProgrammingScheduler:
base_sram_used = 0
for tens in ps.intermediates:
if tens.mem_area == self.mem_area:
- base_sram_used += tens.storage_size()
+ if tens.purpose == TensorPurpose.Weights:
+ base_sram_used += tens.storage_size(self.arch.weight_estimation_scaling)
+ else:
+ base_sram_used += tens.storage_size()
all_block_configs = self.get_block_configs(ps)
for block_config in all_block_configs:
@@ -718,7 +721,7 @@ class DynamicProgrammingScheduler:
)
]
sram_used += ifm_tensor.storage_size_for_sub_purpose(
- TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
+ self.arch, TensorSubPurpose.RollingBufferY, rolling_buffer_y, None
)
all_candidates.extend(self.append_sram_rewrite_list(sram_used, rewrite_list, [strat_opt]))
@@ -779,7 +782,9 @@ class DynamicProgrammingScheduler:
for tens in ps.intermediates:
if tens.mem_area == self.mem_area:
if tens.purpose == TensorPurpose.Weights:
- sram_used += tens.storage_size_for_sub_purpose(TensorSubPurpose.DoubleBuffer, block_config[3])
+ sram_used += tens.storage_size_for_sub_purpose(
+ self.arch, TensorSubPurpose.DoubleBuffer, block_config[3]
+ )
rewrite_list.append(
(
SchedulerRewrite.ChangeTensorSubPurpose,
diff --git a/ethosu/vela/tensor.py b/ethosu/vela/tensor.py
index d4f6a409..3ad9b253 100644
--- a/ethosu/vela/tensor.py
+++ b/ethosu/vela/tensor.py
@@ -439,20 +439,25 @@ class Tensor:
def has_fully_defined_shape(self):
return shape_fully_defined(self.shape)
- def storage_size(self):
- raw_size = self.storage_elements() * self.element_size()
+ def storage_size(self, scale=1.0):
+ raw_size = self.storage_elements() * self.element_size() * scale
if raw_size == 0:
raw_size = 1 # force it to take up space
rounded_size = numeric_util.round_up(numeric_util.round_up_to_int(raw_size), self.alignment)
return rounded_size
- def storage_size_for_sub_purpose(self, sub_purpose, param_a=None, param_b=None):
+ def storage_size_for_sub_purpose(self, arch, sub_purpose, param_a=None, param_b=None):
alt_shape = self.storage_shape_for_sub_purpose(sub_purpose, param_a, param_b)
elems = shape_num_elements(alt_shape)
if elems is None:
return 0
if sub_purpose == TensorSubPurpose.DoubleBuffer:
- raw_size = elems * self.element_size() * self.compression_scale_for_worst_weight_stream
+ raw_size = (
+ elems
+ * self.element_size()
+ * self.compression_scale_for_worst_weight_stream
+ * arch.weight_estimation_scaling
+ )
else:
# Rolling buffers are used for intermediate data in ifm streaming
# These will all use the NHCWB16 format, and need to be aligned to 16 in the C-dimension
diff --git a/ethosu/vela/test/testutil.py b/ethosu/vela/test/testutil.py
index 116afa40..68866fc7 100644
--- a/ethosu/vela/test/testutil.py
+++ b/ethosu/vela/test/testutil.py
@@ -38,6 +38,7 @@ def create_arch():
global_memory_clock_scale=1.0,
max_blockdep=0,
softmax_support=True,
+ weight_estimation_scaling=1.0,
)
diff --git a/ethosu/vela/vela.py b/ethosu/vela/vela.py
index 97cc8736..19080926 100644
--- a/ethosu/vela/vela.py
+++ b/ethosu/vela/vela.py
@@ -260,7 +260,12 @@ def main(args=None):
choices=[True, False],
help="Control if Softmax should be transformed into a set of npu operations (default: %(default)s)",
)
-
+ parser.add_argument(
+ "--weight-estimation-scaling",
+ type=float,
+ default=1.0,
+ help=("Performs an additional scaling of weight compression scale estimate (default: %(default)s)"),
+ )
args = parser.parse_args(args=args)
# Read configuration file
@@ -291,6 +296,7 @@ def main(args=None):
global_memory_clock_scale=args.global_memory_clock_scale,
max_blockdep=args.max_block_dependency,
softmax_support=args.softmax_support,
+ weight_estimation_scaling=args.weight_estimation_scaling,
)
compiler_options = compiler_driver.CompilerOptions(