3 files changed, 17 insertions, 1 deletions
diff --git a/ethosu/vela/architecture_allocator.py b/ethosu/vela/architecture_allocator.py
index e43b841d..86410cfd 100644
--- a/ethosu/vela/architecture_allocator.py
+++ b/ethosu/vela/architecture_allocator.py
@@ -196,6 +196,15 @@ def _get_ifm_blocksize(
     return Shape4D(1, height, width, ofm_block.depth)
 
 
+def fit_block_for_ofm(arch: ArchitectureFeatures, ofm_shape: Shape4D, kernel: Kernel, block: Shape4D):
+    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes): clamps the block height to the OFM height.
+    # This is a specific interpretation of a more general constraint that can't be applied because the
+    # find_block_config function must return block configs that can be applied to any OFM shape.
+    if (ofm_shape.height == 1) and (kernel.height == 1) and (arch.ofm_ublock.height == 2):
+        return Shape4D(1, min(block.height, ofm_shape.height), block.width, block.depth)  # clamp height to the OFM
+    return block  # all other cases: the block is returned as-is
+
+
 def find_block_config(
     arch: ArchitectureFeatures,
     npu_op_type: NpuBlockType,
@@ -274,6 +283,7 @@ def find_block_config(
                     ifm_block = ifm_block.with_depth(ifm_blockdepth)
 
                 # Test if the IFM/OFM blocks fit into SHRAM
+                ofm_block = fit_block_for_ofm(arch, ofm_shape, kernel, ofm_block)
                 layout = _try_block_config(
                     arch.shram, ew_usage, ofm_block, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
                 )
@@ -304,7 +314,7 @@ def find_block_config(
                         config.layout = layout
                         config.bank_size = arch.shram_bank_size
                         config.ifm_block = ifm_block
-                        config.ofm_block = ofm_block
+                        config.ofm_block = Shape4D(1, height, width, depth)
                 else:
                     wont_fit[(width, height)] = True
 
@@ -322,6 +332,7 @@ def try_block_config(
     block_config: Block,
     arch: ArchitectureFeatures,
     npu_op_type: NpuBlockType,
+    ofm_shape: Block,
     ifm_shape: Block,
     ifm2_shape: Optional[Block],
     uses_scalar: bool,
@@ -374,6 +385,9 @@ def try_block_config(
     if not is_equal_depth_op:
         ifm_block = ifm_block.with_depth(ifm_blockdepth)
 
+    # 256/512 Conv1D optimisation (ratio of IFM:Accumulators changes)
+    block_config = fit_block_for_ofm(arch, ofm_shape, kernel, block_config)
+
     layout = _try_block_config(
         arch.shram, ew_usage, block_config, ifm_block, ifm_bits, ifm_granule, acc_bits, acc_granule, lut_banks
     )
diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py
index 20431273..b4a633e9 100644
--- a/ethosu/vela/register_command_stream_generator.py
+++ b/ethosu/vela/register_command_stream_generator.py
@@ -583,6 +583,7 @@ def get_arch_block_config(
         block_config,
         arch,
         block_type,
+        npu_op.ofm.shape,
         ifm_shape,
         ifm2_shape,
         uses_scalar,
diff --git a/ethosu/vela/test/test_architecture_allocator.py b/ethosu/vela/test/test_architecture_allocator.py
index 94768fc1..a199e9d8 100644
--- a/ethosu/vela/test/test_architecture_allocator.py
+++ b/ethosu/vela/test/test_architecture_allocator.py
@@ -106,6 +106,7 @@ def test_allocate(test_data):
         Block.from_shape(config.ofm_block.as_list()),
         arch,
         block_type,
+        ofm_shape,
         ifm_shape,
         ifm2_shape,
         is_partkernel=config.is_partkernel,