Diffstat (limited to 'ethosu/vela/high_level_command_to_npu_op.py')
-rw-r--r-- | ethosu/vela/high_level_command_to_npu_op.py | 497 |
1 file changed, 497 insertions, 0 deletions
diff --git a/ethosu/vela/high_level_command_to_npu_op.py b/ethosu/vela/high_level_command_to_npu_op.py
new file mode 100644
index 00000000..77501210
--- /dev/null
+++ b/ethosu/vela/high_level_command_to_npu_op.py
@@ -0,0 +1,497 @@
+# Copyright (C) 2020 Arm Limited or its affiliates. All rights reserved.
+#
+# SPDX-License-Identifier: Apache-2.0
+#
+# Licensed under the Apache License, Version 2.0 (the License); you may
+# not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an AS IS BASIS, WITHOUT
+# WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+#
+# Description:
+# Conversion from high level command to NpuOperation
+from enum import IntEnum
+from typing import List
+from typing import Optional
+
+from .api import NpuActivation
+from .api import NpuActivationOp
+from .api import NpuAddressRange
+from .api import NpuBlockOperation
+from .api import NpuBlockTraversal
+from .api import NpuConv2DOperation
+from .api import NpuConvDepthWiseOperation
+from .api import NpuDataType
+from .api import NpuDmaOperation
+from .api import NpuElementWiseOp
+from .api import NpuElementWiseOperation
+from .api import NpuFeatureMap
+from .api import NpuKernel
+from .api import NpuLayout
+from .api import NpuOperation
+from .api import NpuPadding
+from .api import NpuPoolingOp
+from .api import NpuPoolingOperation
+from .api import NpuQuantization
+from .api import NpuResamplingMode
+from .api import NpuRoundingMode
+from .api import NpuShape3D
+from .api import NpuTileBox
+from .architecture_features import ArchitectureFeatures
+from .data_type import DataType
+from .high_level_command_stream import Box
+from .high_level_command_stream import Command
+from .high_level_command_stream import CommandType
+from .high_level_command_stream import DMA
+from .high_level_command_stream import NpuStripe
+from .operation import Kernel
+from .operation import NpuBlockType
+from .operation import Op
+from .operation import Operation
+from .tensor import MemType
+from .tensor import Tensor
+from .tensor import TensorBlockTraversal
+from .tensor import TensorFormat
+from .tensor import TensorPurpose
+
+
+unary_elementwise_ops = set((NpuElementWiseOp.ABS, NpuElementWiseOp.LRELU, NpuElementWiseOp.CLZ,))
+
+
+class BasePointerIndex(IntEnum):
+    WeightTensor = 0  # base address index for the Weight tensor
+    ScratchTensor = 1  # base address index for the Scratch_tensor in the TensorArena
+    ScratchFastTensor = 2  # base address for the Scratch_fast_tensor
+    Mem2Mem = (1 << 8) | (3 << 0)  # base address slot for memory-to-memory transfer
+
+
+dtype_map = {
+    DataType.uint8: NpuDataType.UINT8,
+    DataType.int8: NpuDataType.INT8,
+    DataType.uint16: NpuDataType.UINT16,
+    DataType.int16: NpuDataType.INT16,
+    DataType.int32: NpuDataType.INT32,
+}
+
+
+block_traversal_map = {
+    TensorBlockTraversal.DepthFirst: NpuBlockTraversal.DEPTH_FIRST,
+    TensorBlockTraversal.PartKernelFirst: NpuBlockTraversal.PART_KERNEL_FIRST,
+}
+
+
+# Maps an elementwise op type to an elementwise_mode enum value used by NPU_OP_ELEMENTWISE
+elementwise_op_map = {
+    Op.Mul: NpuElementWiseOp.MUL,
+    Op.Add: NpuElementWiseOp.ADD,
+    Op.Sub: NpuElementWiseOp.SUB,
+    Op.Minimum: NpuElementWiseOp.MIN,
+    Op.Maximum: NpuElementWiseOp.MAX,
+    Op.LeakyRelu: NpuElementWiseOp.LRELU,
+    Op.Abs: NpuElementWiseOp.ABS,
+    Op.CLZ: NpuElementWiseOp.CLZ,
+    Op.SHR: NpuElementWiseOp.SHR,
+    Op.SHL: NpuElementWiseOp.SHL,
+}
+
+
+def to_npu_kernel(kernel: Kernel) -> NpuKernel:
+    """Converts the given internally used kernel object to NpuKernel (of public API)"""
+    return NpuKernel(
+        kernel.width, kernel.height, kernel.stride.x, kernel.stride.y, kernel.dilation.x, kernel.dilation.y
+    )
+
+
+def to_kernel(kernel: Optional[NpuKernel]) -> Kernel:
+    """Converts the given public API object to Kernel (used internally)"""
+    if kernel is None:
+        return Kernel(1, 1)
+    return Kernel(kernel.width, kernel.height, kernel.stride_x, kernel.stride_y, kernel.dilation_x, kernel.dilation_y)
+
+
+def ifm_ifm2_correct_order(ifm_shape: List[int], ifm2_shape: List[int]) -> bool:
+    if ifm_shape == []:
+        # Scalar needs to be in IFM2
+        return False
+    if ifm2_shape == []:
+        return True
+
+    for ifm, ifm2 in zip(ifm_shape, ifm2_shape):
+        if ifm != ifm2 and ifm == 1:
+            # Broadcasted FM needs to be in IFM2
+            return False
+    return True
+
+
+def get_rounding_mode(op: Operation) -> NpuRoundingMode:
+    """Specifies type of rounding to be used"""
+    rounding_mode = NpuRoundingMode.TFL
+    if op.type == Op.ResizeBilinear:
+        rounding_mode = NpuRoundingMode.TRUNCATE
+    elif (
+        op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise)
+        and op.ifm.dtype == DataType.int16
+    ):
+        rounding_mode = NpuRoundingMode.NATURAL
+    elif op.type.is_avgpool_op() and op.memory_function == Op.ConcatSliceWrite and op.kernel.elements_wh() == 1:
+        rounding_mode = NpuRoundingMode.NATURAL
+    rounding_mode = op.attrs.get("rounding_mode", rounding_mode)
+    return rounding_mode
+
+
+def create_padding(cmd: NpuStripe, primary_op: Operation) -> NpuPadding:
+    if primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
+        return NpuPadding(top=0, left=0, bottom=0, right=0)
+    explicit_padding = list(primary_op.attrs["explicit_padding"])  # (top, left, bottom, right)
+
+    # Check if this is for horizontal ifm streaming
+    if not (cmd.is_first_h_stripe and cmd.is_last_h_stripe):
+        explicit_padding[0] = cmd.pad_top
+        explicit_padding[2] = cmd.pad_bottom
+
+    # Indexing from end since a 1x1 Avgpool might have been added with non 4-dimensional input/output,
+    # because an activation function needed to be fused.
+    if cmd.ifm_box.start_coord[-2] > 0:
+        explicit_padding[1] = 0
+    if cmd.ifm_box.end_coord[-2] < cmd.ifm_tensor.shape[-2]:
+        explicit_padding[3] = 0
+    return NpuPadding(
+        top=explicit_padding[0], left=explicit_padding[1], bottom=explicit_padding[2], right=explicit_padding[3]
+    )
+
+
+def get_region(tens: Tensor, arch: ArchitectureFeatures) -> int:
+    if arch.feature_map_storage_mem_area == arch.fast_storage_mem_area:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchTensor,
+        }
+    else:
+        base_ptr_idx_map = {
+            MemType.Permanent_NPU: BasePointerIndex.WeightTensor,
+            MemType.Permanent_CPU: BasePointerIndex.WeightTensor,
+            MemType.Scratch: BasePointerIndex.ScratchTensor,
+            MemType.Scratch_fast: BasePointerIndex.ScratchFastTensor,
+        }
+    return int(base_ptr_idx_map[tens.mem_type])
+
+
+def get_upscale(op: Operation) -> NpuResamplingMode:
+    upscale = NpuResamplingMode.NONE
+    if op.type == Op.ResizeBilinear:
+        # perform nearest neighbor upscale
+        upscale = NpuResamplingMode.NEAREST
+    elif op.type == Op.Conv2DBackpropInputSwitchedBias:
+        # perform insert zero upscale
+        upscale = NpuResamplingMode.TRANSPOSE
+    return upscale
+
+
+def get_ifm_depth(npu_block_type: NpuBlockType, ifm_box: Box, ofm_box: Box) -> int:
+    if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+        shape = ifm_box.get_size_shape()
+    else:
+        shape = ofm_box.get_size_shape()
+    return shape[-1]
+
+
+def use_zero_point_0(ps, tens: Tensor, is_ifm_tensor: bool) -> bool:
+    """Checks if quantization should use 0 as zero point"""
+    if tens.dtype == DataType.int32 and is_ifm_tensor:
+        return True
+    if ps.primary_op.type not in (Op.AvgPool, Op.ResizeBilinear, Op.CLZ, Op.SHL):
+        return False
+    fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
+    forced_ofm_quantization = ps.primary_op.forced_output_quantization
+    use_0 = (
+        (ps.primary_op.activation is None or forced_ofm_quantization is not None)
+        and (ps.primary_op.memory_function != Op.ConcatSliceWrite)
+        and not fused_quantize
+    )
+    return use_0
+
+
+def get_ifm_or_ifm2_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
+    """Gets quantization for IFM/IFM2"""
+    if tens.quantization is None:
+        return None
+    if use_zero_point_0(ps, tens, True):
+        zero_point = 0
+    else:
+        zero_point = int(tens.quantization.zero_point)
+    return NpuQuantization(scale_f32=tens.quantization.scale_f32, zero_point=zero_point)
+
+
+def get_ofm_quantization(ps, tens: Tensor) -> Optional[NpuQuantization]:
+    """Gets quantization for OFM"""
+    op = ps.primary_op
+    # Check if the operation's forced output quantization should be used instead of the
+    # output tensor's quantization (used for LUTs)
+    ofm_quant = op.forced_output_quantization if op.forced_output_quantization is not None else tens.quantization
+    if ofm_quant is None:
+        return None
+    if use_zero_point_0(ps, tens, False):
+        zero_point = 0
+    else:
+        zero_point = int(ofm_quant.zero_point)
+    return NpuQuantization(scale_f32=ofm_quant.scale_f32, zero_point=zero_point)
+
+
+def create_feature_map(tens: Tensor, box: Box, arch: ArchitectureFeatures) -> NpuFeatureMap:
+    """Creates feature map with common fields populated"""
+    fm = NpuFeatureMap()
+    fm.region = get_region(tens, arch)
+    fm.data_type = dtype_map[tens.dtype]
+    if tens.format == TensorFormat.NHWC:
+        fm.layout = NpuLayout.NHWC
+    elif tens.format == TensorFormat.NHCWB16:
+        fm.layout = NpuLayout.NHCWB16
+    else:
+        assert 0, "Incorrect tensor format"
+    height_0, height_1, width_0, addresses = tens.addresses_for_rolling_buffer(box.start_coord, box.end_coord)
+    for idx, addr in enumerate(addresses):
+        if addr is None:
+            addresses[idx] = 0
+    fm.tiles = NpuTileBox(
+        height_0=height_0, height_1=height_1, width_0=width_0, addresses=[int(addr) for addr in addresses]
+    )
+    strides = tens.get_strides()
+    fm.strides = NpuShape3D(height=int(strides[2]), width=int(strides[3]), depth=int(strides[1]))
+    return fm
+
+
+def create_weights(weight_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures) -> List[NpuAddressRange]:
+    """Returns address ranges for weights"""
+    weights = []
+    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
+    weight_substream_offsets = weight_tensor.compressed_values_substream_offsets[stream_index]
+    substreams = len(weight_substream_offsets) - 1  # Offset list must terminate with full stream length
+
+    # Extract weight substream offsets and calculate their lengths
+    assert len(weight_substream_offsets) > 1 and (weight_substream_offsets[0] == 0)
+    weight_addr = weight_tensor.address_for_coordinate(weight_box.start_coord)
+    region = get_region(weight_tensor, arch)
+    for core in range(substreams):
+        address = weight_addr + weight_substream_offsets[core]
+        length = weight_substream_offsets[core + 1] - weight_substream_offsets[core]
+        addr_range = NpuAddressRange(region, int(address), int(length))
+        weights.append(addr_range)
+    return weights
+
+
+def create_biases(
+    weight_tensor: Tensor, scale_tensor: Tensor, weight_box: Box, arch: ArchitectureFeatures
+) -> List[NpuAddressRange]:
+    """Returns address ranges for biases"""
+    biases = []
+    stream_index = weight_tensor.compressed_stream_index_from_coord(weight_box.start_coord)
+    scale_substream_offsets = scale_tensor.compressed_values_substream_offsets[stream_index]
+    substreams = len(scale_substream_offsets) - 1  # Offset list must terminate with full stream length
+
+    # Extract scale substream offsets and calculate their lengths
+    assert len(scale_substream_offsets) > 1 and (scale_substream_offsets[0] == 0)
+    scale_addr = scale_tensor.address_for_coordinate(weight_box.start_coord[-1:])
+
+    region = get_region(scale_tensor, arch)
+    for core in range(substreams):
+        address = scale_addr + scale_substream_offsets[core]
+        length = scale_substream_offsets[core + 1] - scale_substream_offsets[core]
+        addr_range = NpuAddressRange(region, int(address), int(length))
+        biases.append(addr_range)
+    return biases
+
+
+def create_npu_activation(op: Operation) -> NpuActivation:
+    """Creates fused activation function"""
+    if op.activation is None:
+        return NpuActivation(NpuActivationOp.NONE_OR_RELU)
+    faf = op.activation.op_type
+    act_op = NpuActivationOp.NONE_OR_RELU
+    if faf == Op.Tanh:
+        act_op = NpuActivationOp.TANH
+    elif faf == Op.Sigmoid:
+        act_op = NpuActivationOp.SIGMOID
+    elif faf == Op.LUT:
+        act_op = NpuActivationOp.TABLE_LOOKUP
+    elif not faf.is_relu_op():
+        raise Exception("Unsupported fused_activation_function = " + faf.name)
+
+    act = NpuActivation(act_op)
+    act.min = op.activation.min
+    act.max = op.activation.max
+    act.lookup_table_index = op.activation.lut_index
+    return act
+
+
+def set_common_op_fields(npu_op: NpuBlockOperation, cmd: NpuStripe, arch: ArchitectureFeatures):
+    """Sets common fields of the given operation"""
+    ps = cmd.ps
+    op = ps.primary_op
+    in_shape = cmd.ifm_box.get_size_shape()
+    out_shape = cmd.ofm_box.get_size_shape()
+    ofm_height = out_shape[-3] if len(out_shape) >= 4 else 1
+    ofm_width = out_shape[-2] if len(out_shape) >= 2 else 1
+    ofm_depth = out_shape[-1] if len(out_shape) >= 1 else 1
+    ifm_height = in_shape[-3] if len(in_shape) >= 4 else 1
+    if op.type.npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct, NpuBlockType.ReduceSum):
+        ifm_depth = in_shape[-1] if len(in_shape) >= 1 else 1
+    else:
+        ifm_depth = ofm_depth
+
+    npu_op.ifm = create_feature_map(cmd.ifm_tensor, cmd.ifm_box, arch)
+    npu_op.ifm.shape = NpuShape3D(height=ifm_height, width=cmd.ifm_tensor.shape[-2], depth=ifm_depth)
+    npu_op.ifm.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm_tensor)
+    npu_op.ofm = create_feature_map(cmd.ofm_tensor, cmd.ofm_box, arch)
+    npu_op.ofm.shape = NpuShape3D(height=ofm_height, width=ofm_width, depth=ofm_depth)
+    npu_op.ofm.quantization = get_ofm_quantization(ps, cmd.ofm_tensor)
+
+    if cmd.weight_tensor is not None:
+        npu_op.weights = create_weights(cmd.weight_tensor, cmd.weight_box, arch)
+        if cmd.scale_tensor is not None:
+            npu_op.biases = create_biases(cmd.weight_tensor, cmd.scale_tensor, cmd.weight_box, arch)
+    npu_op.activation = create_npu_activation(op)
+    npu_op.rounding_mode = get_rounding_mode(op)
+    npu_op.block_config = NpuShape3D(height=ps.block_config[0], width=ps.block_config[1], depth=ps.block_config[3])
+
+    if not op.type.is_elementwise_op():
+        npu_op.padding = create_padding(cmd, op)
+        npu_op.kernel = to_npu_kernel(op.kernel)
+    npu_op.ifm_upscale = get_upscale(op)
+    npu_op.fused_quantize = any(op.type == Op.Quantize for op in ps.ops)
+    return npu_op
+
+
+def create_npu_conv2d_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConv2DOperation:
+    """Converts the command to NpuConv2DOperation"""
+    npu_op = NpuConv2DOperation()
+    set_common_op_fields(npu_op, cmd, arch)
+    if cmd.ps.primary_op.type.npu_block_type == NpuBlockType.VectorProduct:
+        npu_op.block_traversal = NpuBlockTraversal.DEPTH_FIRST
+    else:
+        npu_op.block_traversal = block_traversal_map[cmd.weight_tensor.block_traversal]
+    return npu_op
+
+
+def create_npu_conv_depthwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuConvDepthWiseOperation:
+    """Converts the command to NpuConvDepthWiseOperation"""
+    npu_op = NpuConvDepthWiseOperation()
+    set_common_op_fields(npu_op, cmd, arch)
+    return npu_op
+
+
+def create_npu_pool_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuPoolingOperation:
+    """Converts the command to NpuPoolingOperation"""
+    ps = cmd.ps
+    op = ps.primary_op
+    pool_op = NpuPoolingOp.AVERAGE
+    if op.type.is_maxpool_op():
+        pool_op = NpuPoolingOp.MAX
+    elif op.type.is_avgpool_op() or op.type == Op.ResizeBilinear:
+        pool_op = NpuPoolingOp.AVERAGE
+    elif op.type == Op.ReduceSum:
+        pool_op = NpuPoolingOp.REDUCE_SUM
+    else:
+        assert 0, f"Unknown pool type {op.type}"
+    npu_op = NpuPoolingOperation(pool_op)
+    set_common_op_fields(npu_op, cmd, arch)
+    # Pooling specific info
+    if op.type == Op.ResizeBilinear and "rescale" in op.attrs:
+        npu_op.rescale = op.attrs["rescale"]
+    return npu_op
+
+
+def create_npu_elementwise_op(cmd: NpuStripe, arch: ArchitectureFeatures) -> NpuElementWiseOperation:
+    """Converts the command to NpuElementWiseOperation"""
+    ps = cmd.ps
+    op = ps.primary_op
+    assert op.type in elementwise_op_map, f"Unknown elementwise type {op.type}"
+    elemwise_op = elementwise_op_map[op.type]
+    npu_op = NpuElementWiseOperation(elemwise_op)
+    if elemwise_op not in unary_elementwise_ops:
+        if not ifm_ifm2_correct_order(cmd.ifm_tensor.shape, cmd.ifm2_tensor.shape):
+            # The scalar/broadcasted feature map has to be the ifm2 tensor so switch the ifms
+            cmd.ifm_tensor, cmd.ifm2_tensor = cmd.ifm2_tensor, cmd.ifm_tensor
+            cmd.ifm_box, cmd.ifm2_box = cmd.ifm2_box, cmd.ifm_box
+            npu_op.reversed_operands = True
+        npu_op.ifm2 = create_feature_map(cmd.ifm2_tensor, cmd.ifm2_box, arch)
+        npu_op.ifm2.quantization = get_ifm_or_ifm2_quantization(ps, cmd.ifm2_tensor)
+        if cmd.ifm2_tensor.shape == []:
+            # scalar
+            assert cmd.ifm2_tensor.quant_values.size == 1
+            npu_op.ifm2_scalar = cmd.ifm2_tensor.values.item(0)
+            npu_op.ifm2.shape = NpuShape3D(height=0, width=0, depth=0)
+        else:
+            box_shp = cmd.ifm2_box.get_size_shape()
+            height = box_shp[-3] if len(box_shp) >= 3 else 1
+            npu_op.ifm2.shape = NpuShape3D(height=height, width=cmd.ifm2_tensor.shape[-2], depth=box_shp[-1])
+    set_common_op_fields(npu_op, cmd, arch)
+    # Check if output scale needs to be overridden
+    output_scale = None
+    if op.type == Op.Add and "resizebilinear" in op.attrs:
+        # Force output scale same as the input scale for
+        # resizebilinear 1x1 that is converted to add
+        output_scale = npu_op.ifm2.quantization.scale_f32
+    if op.type == Op.LeakyRelu:
+        output_scale = op.attrs["alpha"]
+    if op.type in (Op.Add, Op.Sub) and "rescale" in op.attrs:
+        npu_op.rescale = op.attrs.get("rescale")
+    if op.type in (Op.Add, Op.Mul, Op.Sub):
+        if op.activation is not None and op.activation.op_type in (Op.Sigmoid, Op.Tanh):
+            output_scale = 1 / 0x3000
+    if output_scale is not None:
+        npu_op.ofm.quantization = NpuQuantization(scale_f32=output_scale, zero_point=npu_op.ofm.quantization.zero_point)
+    return npu_op
+
+
+def create_dma_op(cmd: DMA, arch: ArchitectureFeatures) -> NpuDmaOperation:
+    """Converts the command to NpuDmaOperation"""
+    src_region = get_region(cmd.in_tensor, arch)
+    if cmd.out_tensor.purpose == TensorPurpose.LUT:
+        dest_region = BasePointerIndex.Mem2Mem
+    else:
+        dest_region = get_region(cmd.out_tensor, arch)
+
+    start_coord = cmd.box.start_coord
+    src_addr = cmd.in_tensor.address_for_coordinate(start_coord)
+    dest_addr = cmd.out_tensor.address_for_coordinate(start_coord)
+
+    if cmd.in_tensor.compressed_values is not None:
+        if cmd.out_tensor.purpose == TensorPurpose.FSBias:
+            sz = cmd.in_tensor.storage_size()
+        else:
+            stream_index = cmd.in_tensor.compressed_stream_index_from_coord(start_coord)
+            sz = cmd.in_tensor.size_of_compressed_stream(stream_index)
+    else:
+        sz = cmd.in_tensor.address_for_coordinate(cmd.box.end_coord, is_top_box=True) - src_addr
+    src = NpuAddressRange(src_region, int(src_addr), int(sz))
+    dest = NpuAddressRange(dest_region, int(dest_addr), int(sz))
+    return NpuDmaOperation(src, dest)
+
+
+def convert_command_to_npu_op(cmd: Command, arch: ArchitectureFeatures) -> NpuOperation:
+    """Converts the high level command to NpuOperation"""
+    if cmd.cmdtype == CommandType.DMA:
+        npu_op = create_dma_op(cmd, arch)
+    elif cmd.cmdtype == CommandType.NpuStripe:
+        npu_block_type = cmd.ps.primary_op.type.npu_block_type
+        if npu_block_type in (NpuBlockType.ConvolutionMxN, NpuBlockType.VectorProduct):
+            npu_op = create_npu_conv2d_op(cmd, arch)
+        elif npu_block_type == NpuBlockType.ConvolutionDepthWise:
+            npu_op = create_npu_conv_depthwise_op(cmd, arch)
+        elif npu_block_type in (NpuBlockType.Pooling, NpuBlockType.ReduceSum):
+            npu_op = create_npu_pool_op(cmd, arch)
+        elif npu_block_type == NpuBlockType.ElementWise:
+            npu_op = create_npu_elementwise_op(cmd, arch)
+        else:
+            assert 0, f"Unknown command type {npu_block_type}"
+    # add a link to the high level command for debugging purposes
+    npu_op.cmd = cmd
+    return npu_op