diff options
author | Rickard Bolin <rickard.bolin@arm.com> | 2022-05-16 09:11:06 +0000 |
---|---|---|
committer | Rickard Bolin <rickard.bolin@arm.com> | 2022-05-16 15:20:20 +0000 |
commit | fd8b500085d1ac1cca54a71631d21713a3c21f09 (patch) | |
tree | 4a8d1c7809dc1eb748f0f0b9ba2736e5d7bb5e69 /ethosu/vela/scheduler.py | |
parent | 6f4cb0362a2f00b3045565de2c27f72997b2998b (diff) | |
download | ethos-u-vela-fd8b500085d1ac1cca54a71631d21713a3c21f09.tar.gz |
MLBEDSW-6263: Use separate tensors for double buffering
Uses separate tensors for the individual weight buffers
in case of weight double buffering.
Each weight buffer tensor gets its own individual live range.
This patch is a clone of a previously reverted patch, but with some
additional bug fixes applied.
Signed-off-by: Rickard Bolin <rickard.bolin@arm.com>
Change-Id: I868c70d15821eb9f1399186f2da6e7345f6ee343
Diffstat (limited to 'ethosu/vela/scheduler.py')
-rw-r--r-- | ethosu/vela/scheduler.py | 70 |
1 files changed, 45 insertions, 25 deletions
diff --git a/ethosu/vela/scheduler.py b/ethosu/vela/scheduler.py index 3cfde28a..67b890e1 100644 --- a/ethosu/vela/scheduler.py +++ b/ethosu/vela/scheduler.py @@ -107,7 +107,7 @@ class SchedulerOpInfo: self.ofm_depth_slices: List[int] = [0, stripe.depth] self.npu_weights_tensor: Optional[NpuWeightTensor] = None self.npu_scales_tensor: Optional[NpuWeightTensor] = None - self.buffered_weight_tensor: Optional[Tensor] = None + self.buffered_weight_tensors: List[Tensor] = [] self.cycles: Optional[CycleCost] = None self.slack_buffering_cycles = 0 self.slack_buffering_memory = 0 @@ -131,9 +131,8 @@ class SchedulerOpInfo: res += f"\t\tIFM2 Stripe = {self.stripe_input2}\n" res += f"\t\tOFM Stripe = {self.stripe}\n" res += f"\t\tEncoded Weights = {self.npu_weights_tensor and len(self.npu_weights_tensor.buffer)} bytes\n" - res += ( - f"\t\tWeight buffer = {self.buffered_weight_tensor and self.buffered_weight_tensor.storage_size()} bytes\n" - ) + for idx, tens in enumerate(self.buffered_weight_tensors): + res += f"\t\tWeight buffer{idx + 1} = {tens.storage_size()} bytes\n" res += f"\t\tDepth slices = {self.ofm_depth_slices}\n" res += f"\t\tAssigned Cascade = {self.cascade}" return res @@ -734,7 +733,7 @@ class Scheduler: # Chosen buffering might not fit at all, iterate until it does # or until the minimum usable slice size is reached if ( - encoded_weights.max_range_bytes <= half_buffer_limit + encoded_weights.double_buffer_size() <= buffer_limit_bytes or prebuffer_depth == ArchitectureFeatures.OFMSplitDepth ): break @@ -751,24 +750,42 @@ class Scheduler: cost.slack_buffering_cycles = tail_cycles.op_cycles # Determine whether the weights need to be double buffered - weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes) + weight_buffer_size = min(len(encoded_weights.buffer), encoded_weights.max_range_bytes()) # Only buffer weights if there's still space left for the buffer if weight_buffer_size <= buffer_limit_bytes: assert weight_buffer_size % 16 == 0 # Determine whether to double buffer or single buffer - if (weight_buffer_size * 2 <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)): - weight_buffer_size = weight_buffer_size * 2 + double_buffer_size = encoded_weights.double_buffer_size() + if (double_buffer_size <= buffer_limit_bytes) and (weight_buffer_size < len(encoded_weights.buffer)): weight_tensor_purpose = TensorSubPurpose.DoubleBuffer else: weight_tensor_purpose = TensorSubPurpose.Standard - cost.buffered_weight_tensor = self.buffer_tensor( - encoded_weights, weight_tensor_purpose, weight_buffer_size, weight_tensor.name - ) + cost.buffered_weight_tensors = [ + self.buffer_tensor( + encoded_weights, + weight_tensor_purpose, + encoded_weights.double_buffer_sizes[0], + weight_tensor.name + "_buffer", + ) + ] + if weight_tensor_purpose == TensorSubPurpose.DoubleBuffer: + buf2 = self.buffer_tensor( + encoded_weights, + weight_tensor_purpose, + encoded_weights.double_buffer_sizes[1], + weight_tensor.name + "_buffer2", + ) + cost.buffered_weight_tensors.append(buf2) + + last_used_buffer_idx = len(cost.ofm_depth_slices) % len(cost.buffered_weight_tensors) + weight_buffer_size = encoded_weights.double_buffer_sizes[last_used_buffer_idx] + if ref_cost.cascade == 0: - # Determine if the lifetime can be extended and pre-buffer weights under the previous operation - cost.buffered_weight_tensor.pre_buffer = weight_buffer_size < slack_memory + # Determine if the lifetime can be extended and pre-buffer the first weight buffer + # under the previous operation + cost.buffered_weight_tensors[0].pre_buffer = encoded_weights.double_buffer_size() < slack_memory cost.slack_buffering_memory -= weight_buffer_size else: @@ -781,7 +798,7 @@ class Scheduler: cost.npu_scales_tensor = encoded_scales def buffer_tensor(self, src_tensor: Tensor, sub_purpose: TensorSubPurpose, buffer_size: int, name: str) -> Tensor: - buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name + "_buffer") + buffered_weight_tensor = Tensor([1, 1, 1, buffer_size], DataType.uint8, name) buffered_weight_tensor.src_tensor = src_tensor buffered_weight_tensor.mem_area = self.arch.fast_storage_mem_area buffered_weight_tensor.mem_type = MemType.Scratch_fast @@ -823,11 +840,16 @@ class Scheduler: # Create a cost entry with the new stripe cost = sched_op.create_scheduler_info(self.nng, stripe) - if ref_cost[sched_op].buffered_weight_tensor: + weight_tensor = cost.npu_weights_tensor + for idx, buffered_tens in enumerate(ref_cost[sched_op].buffered_weight_tensors): # If the weights are buffered in the reference schedule they should be in the new proposal - weight_tensor = cost.npu_weights_tensor - cost.buffered_weight_tensor = self.buffer_tensor( - weight_tensor, TensorSubPurpose.Standard, len(weight_tensor.buffer), weight_tensor.name + cost.buffered_weight_tensors.append( + self.buffer_tensor( + weight_tensor, + buffered_tens.sub_purpose, + weight_tensor.double_buffer_sizes[idx], + buffered_tens.name, + ) ) # Estimate performance @@ -856,9 +878,7 @@ class Scheduler: peak_mem_usage = max(cascade_info.mem_usage, peak_mem_usage) else: # This Op is not part of a cascade - calculate the memory usage - op_weight_buffer = 0 - if cost[sched_op].buffered_weight_tensor: - op_weight_buffer = cost[sched_op].buffered_weight_tensor.storage_size() + op_weight_buffer = sum(tens.storage_size() for tens in cost[sched_op].buffered_weight_tensors) op_mem_usage = ( sched_op.ifm_size_in_bytes() @@ -1013,8 +1033,8 @@ class Scheduler: weight_ops = {} for sched_op in self.sched_ops: cost = self.sg.schedule.cost_map[sched_op] - if cost.buffered_weight_tensor: - weight_ops[cost.buffered_weight_tensor] = sched_op + for tens in cost.buffered_weight_tensors: + weight_ops[tens] = sched_op # Filter out weight buffer live ranges weight_lrs = [] @@ -1088,8 +1108,8 @@ class Scheduler: sched_op.parent_ps.block_config = op_info.block_config.old_style_representation() # Ensure that the src_tensor reference is set correctly - if op_info.buffered_weight_tensor: - op_info.buffered_weight_tensor.src_tensor = op_info.npu_weights_tensor + for tens in op_info.buffered_weight_tensors: + tens.src_tensor = op_info.npu_weights_tensor def use_fast_storage_for_feature_maps(self, schedule, staging_limit): max_mem_usage = [] |