Diffstat (limited to 'ethosu/vela/high_level_command_to_npu_op.py')
-rw-r--r--  ethosu/vela/high_level_command_to_npu_op.py  497
1 file changed, 497 insertions, 0 deletions
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
new file mode 100644
index 00000000..77501210
--- /dev/null
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -0,0 +1,497 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Conversion from high level command to NpuOperation
+from enum import IntEnum
+from typing import List
+from typing import Optional
+
+from .api import NpuActivation
+from .api import NpuActivationOp
+from .api import NpuAddressRange
+from .api import NpuBlockOperation
+from .api import NpuBlockTraversal
+from .api import NpuConv2DOperation
+from .api import NpuConvDepthWiseOperation
+from .api import NpuDataType
+from .api import NpuDmaOperation
+from .api import NpuElementWiseOp
+from .api import NpuElementWiseOperation
+from .api import NpuFeatureMap
+from .api import NpuKernel
+from .api import NpuLayout
+from .api import NpuOperation
+from .api import NpuPadding
+from .api import NpuPoolingOp
+from .api import NpuPoolingOperation
+from .api import NpuQuantization
+from .api import NpuResamplingMode
+from .api import NpuRoundingMode
+from .api import NpuShape3D
+from .api import NpuTileBox
+from .architecture_features import ArchitectureFeatures
+from .data_type import DataType
+from .high_level_command_stream import Box
+from .high_level_command_stream import Command
+from .high_level_command_stream import CommandType
+from .high_level_command_stream import DMA
+from .high_level_command_stream import NpuStripe
+from .operation import Kernel
+from .operation import NpuBlockType
+from .operation import Op
+from .operation import Operation
+from .tensor import MemType
+from .tensor import Tensor
+from .tensor import TensorBlockTraversal
+from .tensor import TensorFormat
+from .tensor import TensorPurpose
+
+
+unary_elementwise_ops = set((NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ,))
+
+
+class BasePointerIndex(IntEnum):
+    WeightTensor = 0  # base address index for the Weight tensor
+    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
+    ScratchFastTensor = 2  # base address index for the Scratch_fast_tensor
+    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer
+
+
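+# Maps the internally used DataType to the NpuDataType of the public API.
+# Only integer types are mapped; any other dtype raises a KeyError when the feature map is created.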
+dtype_map = {
+    DataType.uint8: NpuDataType.UINT8,
+    DataType.int8: NpuDataType.INT8,
+    DataType.uint16: NpuDataType.UINT16,
+    DataType.int16: NpuDataType.INT16,
+    DataType.int32: NpuDataType.INT32,
+}
+
+
+block_traversal_map = {
+    TensorBlockTraversal.DepthFirst: NpuBlockTraversal.DEPTH_FIRST,
+    TensorBlockTraversal.PartKernelFirst: NpuBlockTraversal.PART_KERNEL_FIRST,
+}
+
+
+# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
+elementwise_op_map = {
+    Op.Mul: NpuElementWiseOp.MUL,
+    Op.Add: NpuElementWiseOp.ADD,
+    Op.Sub: NpuElementWiseOp.SUB,
+    Op.Minimum: NpuElementWiseOp.MIN,
+    Op.Maximum: NpuElementWiseOp.MAX,
+    Op.LeakyRelu: NpuElementWiseOp.LRELU,
+    Op.Abs: NpuElementWiseOp.ABS,
+    Op.CLZ: NpuElementWiseOp.CLZ,
+    Op.SHR: NpuElementWiseOp.SHR,
+    Op.SHL: NpuElementWiseOp.SHL,
+}
+
+
+def to_npu_kernel(kernel: Kernel) -> NpuKernel:
+ """Converts the given internally used kernel object to NpuKernel (of public API)"""
+ return NpuKernel(
+ kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
+ )
+
+
+def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
+ """Converts the given public API object to Kernel (used internally)"""
+ if kernel is None:
+ return Kernel(1, 1)
+ return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
+
+
+def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
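+    """Returns True if IFM and IFM2 are already in the required order, i.e. the scalar/broadcast operand is IFM2"""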
+    if ifm_shape == []:
+        # Scalar needs to be in IFM2
+        return False
+    if ifm2_shape == []:
+        return True
+
+    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
+        if ifm != ifm2 and ifm == 1:
+            # Broadcasted FM needs to be in IFM2
+            return False
+    return True
+
+
+def get_rounding_mode(op: Operation) -> NpuRoundingMode:
+ """Specifies type of rounding to be used"""
+ rounding_mode = NpuRoundingMode.TFL
+ if op.type == Op.ResizeBilinear:
+ rounding_mode = NpuRoundingMode.TRUNCATE
+ elif (
+ op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
+ and op.ifm.dtype == DataType.int16
+ ):
+ rounding_mode = NpuRoundingMode.NATURAL
+ elif op.type.is_avgpool_op() and op.memory_function == Op.ConcatSliceWrite and op.kernel.elements_wh() == 1:
+ rounding_mode = NpuRoundingMode.NATURAL
+ rounding_mode = op.attrs.get("rounding_mode", rounding_mode)
+ return rounding_mode
+
+
+def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
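+    """Returns the padding to use for the given stripe, based on the operation's explicit padding"""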
+    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
+        return NpuPadding(top=0, left=0, bottom=0, right=0)
+    explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)
+
+    # Check if this is for horizontal ifm streaming
+    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+        explicit_padding[0] = cmd.pad_top
+        explicit_padding[2] = cmd.pad_bottom
+
+    # Indexing from the end since a 1x1 AvgPool with non 4-dimensional input/output might have been added
+    # because an activation function needed to be fused.
+    if cmd.ifm_box.start_coord[-2] > 0:
+        explicit_padding[1] = 0
+    if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
+        explicit_padding[3] = 0
+    return NpuPadding(
+        top=explicit_padding[0], left=explicit_padding[1], bottom=explicit_padding[2], right=explicit_padding[3]
+    )
+
+
+def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
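+    """Returns the base pointer index (region) to use for the given tensor"""
+    # When feature maps are stored in the fast storage memory area, Scratch and Scratch_fast share a base pointer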
+    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
+        }
+    else:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
+        }
+    return int(base_ptr_idx_map[tens.mem_type])
+
+
+def get_upscale(op: Operation) -> NpuResamplingMode:
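+    """Returns the resampling (upscaling) mode to use for the given operation"""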
+    upscale = NpuResamplingMode.NONE
+    if op.type == Op.ResizeBilinear:
+        # perform nearest neighbor upscale
+        upscale = NpuResamplingMode.NEAREST
+    elif op.type == Op.Conv2DBackpropInputSwitchedBias:
+        # perform insert zero upscale
+        upscale = NpuResamplingMode.TRANSPOSE
+    return upscale
+
+
+def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
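+    """Returns the IFM depth: taken from the IFM box for convolution/vector product/reduce sum, else from the OFM box"""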
+    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+        shape = ifm_box.get_size_shape()
+    else:
+        shape = ofm_box.get_size_shape()
+    return shape[-1]
+
+
+def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
+ """Checks if quantization should use 0 as zero point"""
+ if tens.dtype == DataType.int32 and is_ifm_tensor:
+ return True
+ if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
+ return False
+ fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
+ forced_ofm_quantization = ps.primary_op.forced_output_quantization
+ use_0 = (
+ (ps.primary_op.activation is None or forced_ofm_quantization is not None)
+ and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
+ and not fused_quantize
+ )
+ return use_0
+
+
+def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
+ """Gets quantization for IFM/IFM2"""
+ if tens.quantization is None:
+ return None
+ if use_zero_point_0(ps, tens, True):
+ zero_point = 0
+ else:
+ zero_point = int(tens.quantization.zero_point)
+ return NpuQuantization(scale_f32=tens.quantization.scale_f32, zero_point=zero_point)
+
+
+def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
+ """Gets quantization for OFM"""
+ op = ps.primary_op
+ # Check if operation's output quantization is should be used instead of the output tensor's quantization
+ # (used in LUTs)
+ ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
+ if ofm_quant is None:
+ return None
+ if use_zero_point_0(ps, tens, False):
+ zero_point = 0
+ else:
+ zero_point = int(ofm_quant.zero_point)
+ return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
+
+
+def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures) -> NpuFeatureMap:
+ """Creates feature map with common fields populated"""
+ fm = NpuFeatureMap()
+ fm.region = get_region(tens, arch)
+ fm.data_type = dtype_map[tens.dtype]
+ if tens.format == TensorFormat.NHWC:
+ fm.layout = NpuLayout.NHWC
+ elif tens.format == TensorFormat.NHCWB16:
+ fm.layout = NpuLayout.NHCWB16
+ else:
+ assert 0, "Incorrect tensor format"
+ height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(box.start_coord, box.end_coord)
+ for idx, addr in enumerate(addresses):
+ if addr is None:
+ addresses[idx] = 0
+ fm.tiles = NpuTileBox(
+ height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
+ )
+ strides = tens.get_strides()
+ fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
+ return fm
+
+
+def create_weights(weight_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures) -> List[NpuAddressRange]:
+ """Returns address ranges for weights"""
+ weights = []
+ stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
+ weight_substream_offsets = weight_tensor.compressed_values_substream_offsets[stream_index]
+ substreams = len(weight_substream_offsets) - 1 # Offset list must terminate with full stream length
+
+ # Extract weight substream offsets and calculate their lengths
+ assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
+ weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
+ region = get_region(weight_tensor, arch)
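+    # One address range per weight substream (one substream per NPU core)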
+    for core in range(substreams):
+        address = weight_addr + weight_substream_offsets[core]
+        length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
+        addr_range = NpuAddressRange(region, int(address), int(length))
+        weights.append(addr_range)
+    return weights
+
+
+def create_biases(
+    weight_tensor: Tensor, scale_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures
+) -> List[NpuAddressRange]:
+    """Returns address ranges for biases"""
+    biases = []
+    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
+    scale_substream_offsets = scale_tensor.compressed_values_substream_offsets[stream_index]
+    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length
+
+    # Extract scale substream offsets and calculate their lengths
+    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
+    scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])
+
+    region = get_region(scale_tensor, arch)
+    for core in range(substreams):
+        address = scale_addr + scale_substream_offsets[core]
+        length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
+        addr_range = NpuAddressRange(region, int(address), int(length))
+        biases.append(addr_range)
+    return biases
+
+
+def create_npu_activation(op: Operation) -> NpuActivation:
+ """Creates fused activation function"""
+ if op.activation is None:
+ return NpuActivation(NpuActivationOp.NONE_OR_RELU)
+ faf = op.activation.op_type
+ act_op = NpuActivationOp.NONE_OR_RELU
+ if faf == Op.Tanh:
+ act_op = NpuActivationOp.TANH
+ elif faf == Op.Sigmoid:
+ act_op = NpuActivationOp.SIGMOID
+ elif faf == Op.LUT:
+ act_op = NpuActivationOp.TABLE_LOOKUP
+ elif not faf.is_relu_op():
+ raise Exception("Unsupported fused_activation_function = " + faf.name)
+
+ act = NpuActivation(act_op)
+ act.min = op.activation.min
+ act.max = op.activation.max
+ act.lookup_table_index = op.activation.lut_index
+ return act
+
+
+def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
+ """Sets common fields of the given operation"""
+ ps = cmd.ps
+ op = ps.primary_op
+ in_shape = cmd.ifm_box.get_size_shape()
+ out_shape = cmd.ofm_box.get_size_shape()
+ ofm_height = out_shape[-3] if len(out_shape) >= 4 else 1
+ ofm_width = out_shape[-2] if len(out_shape) >= 2 else 1
+ ofm_depth = out_shape[-1] if len(out_shape) >= 1 else 1
+ ifm_height = in_shape[-3] if len(in_shape) >= 4 else 1
+ if op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+ ifm_depth = in_shape[-1] if len(in_shape) >= 1 else 1
+ else:
+ ifm_depth = ofm_depth
+
+ npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch)
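+    # Note that the IFM width is taken from the full IFM tensor shape, not from the stripe box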
+    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=cmd.ifm_tensor.shape[-2], depth=ifm_depth)
+    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
+    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch)
+    npu_op.ofm.shape = NpuShape3D(height=ofm_height, width=ofm_width, depth=ofm_depth)
+    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
+
+    if cmd.weight_tensor is not None:
+        npu_op.weights = create_weights(cmd.weight_tensor, cmd.weight_box, arch)
+        if cmd.scale_tensor is not None:
+            npu_op.biases = create_biases(cmd.weight_tensor, cmd.scale_tensor, cmd.weight_box, arch)
+    npu_op.activation = create_npu_activation(op)
+    npu_op.rounding_mode = get_rounding_mode(op)
+    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])
+
+    if not op.type.is_elementwise_op():
+        npu_op.padding = create_padding(cmd, op)
+        npu_op.kernel = to_npu_kernel(op.kernel)
+    npu_op.ifm_upscale = get_upscale(op)
+    npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
+    return npu_op
+
+
+def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
+ """Converts the command to NpuConv2DOperation"""
+ npu_op = NpuConv2DOperation()
+ set_common_op_fields(npu_op, cmd, arch)
+ if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
+ npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
+ else:
+ npu_op.block_traversal = block_traversal_map[cmd.weight_tensor.block_traversal]
+ return npu_op
+
+
+def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
+ """Converts the command to NpuConvDepthWiseOperation"""
+ npu_op = NpuConvDepthWiseOperation()
+ set_common_op_fields(npu_op, cmd, arch)
+ return npu_op
+
+
+def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
+ """Converts the command to NpuPoolingOperation"""
+ ps = cmd.ps
+ op = ps.primary_op
+ pool_op = NpuPoolingOp.AVERAGE
+ if op.type.is_maxpool_op():
+ pool_op = NpuPoolingOp.MAX
+ elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
+ pool_op = NpuPoolingOp.AVERAGE
+ elif op.type == Op.ReduceSum:
+ pool_op = NpuPoolingOp.REDUCE_SUM
+ else:
+ assert 0, f"Unknown pool type {op.type}"
+ npu_op = NpuPoolingOperation(pool_op)
+ set_common_op_fields(npu_op, cmd, arch)
+ # Pooling specific info
+ if op.type == Op.ResizeBilinear and "rescale" in op.attrs:
+ npu_op.rescale = op.attrs["rescale"]
+ return npu_op
+
+
+def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
+ """Converts the command to NpuElementWiseOperation"""
+ ps = cmd.ps
+ op = ps.primary_op
+ assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
+ elemwise_op = elementwise_op_map[op.type]
+ npu_op = NpuElementWiseOperation(elemwise_op)
+ if elemwise_op not in unary_elementwise_ops:
+ if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
+ # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
+ cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
+ cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
+ npu_op.reversed_operands = True
+ npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch)
+ npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
+ if cmd.ifm2_tensor.shape == []:
+ # scalar
+ assert cmd.ifm2_tensor.quant_values.size == 1
+ npu_op.ifm2_scalar = cmd.ifm2_tensor.values.item(0)
+ npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
+ else:
+ box_shp = cmd.ifm2_box.get_size_shape()
+ height = box_shp[-3] if len(box_shp) >= 3 else 1
+ npu_op.ifm2.shape = NpuShape3D(height=height, width=cmd.ifm2_tensor.shape[-2], depth=box_shp[-1])
+ set_common_op_fields(npu_op, cmd, arch)
+ # Check if output scale needs to be overridden
+ output_scale = None
+ if op.type == Op.Add and "resizebilinear" in op.attrs:
+ # Force output scale same as the input scale for
+ # resizebilinear 1x1 that is converted to add
+ output_scale = npu_op.ifm2.quantization.scale_f32
+ if op.type == Op.LeakyRelu:
+ output_scale = op.attrs["alpha"]
+ if op.type in (Op.Add, Op.Sub) and "rescale" in op.attrs:
+ npu_op.rescale = op.attrs.get("rescale")
+ if op.type in (Op.Add, Op.Mul, Op.Sub):
+ if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
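+            # A fused Sigmoid/Tanh overrides the output scale with a fixed value (1 / 0x3000 = 1 / 12288)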
+            output_scale = 1 / 0x3000
+    if output_scale is not None:
+        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
+    return npu_op
+
+
+def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
+ """Converts the command to NpuDmaOperation"""
+ src_region = get_region(cmd.in_tensor, arch)
+ if cmd.out_tensor.purpose == TensorPurpose.LUT:
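+        # Copies into the LUT use the dedicated memory-to-memory base pointer slot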
+        dest_region = BasePointerIndex.Mem2Mem
+    else:
+        dest_region = get_region(cmd.out_tensor, arch)
+
+    start_coord = cmd.box.start_coord
+    src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
+    dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+
+    if cmd.in_tensor.compressed_values is not None:
+        if cmd.out_tensor.purpose == TensorPurpose.FSBias:
+            sz = cmd.in_tensor.storage_size()
+        else:
+            stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+            sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+    else:
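+        # Uncompressed tensor: the size is the distance between the addresses of the box's start and end coordinates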
+        sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
+    src = NpuAddressRange(src_region, int(src_addr), int(sz))
+    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
+    return NpuDmaOperation(src, dest)
+
+
+def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
+ """Converts the high level command to NpuOperation"""
+ if cmd.cmdtype == CommandType.DMA:
+ npu_op = create_dma_op(cmd, arch)
+ elif cmd.cmdtype == CommandType.NpuStripe:
+ npu_block_type = cmd.ps.primary_op.type.npu_block_type
+ if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
+ npu_op = create_npu_conv2d_op(cmd, arch)
+ elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+ npu_op = create_npu_conv_depthwise_op(cmd, arch)
+ elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
+ npu_op = create_npu_pool_op(cmd, arch)
+ elif npu_block_type == NpuBlockType.ElementWise:
+ npu_op = create_npu_elementwise_op(cmd, arch)
+ else:
+ assert 0, f"Unknown command type {npu_block_type}"
+ # add a link to the high level command for debugging purposes
+ npu_op.cmd = cmd
+ return npu_op