From cf7da10987cac3fc68cf180a9af665fe06d608fa Mon Sep 17 00:00:00 2001 From: Jacob Bohlin Date: Wed, 20 May 2020 09:03:40 +0200 Subject: MLBEDSW-1716: Transpose Convolution support Change-Id: Ie6d8d6de9f3447f19ba06aafa9fa480fc96a973b Signed-off-by: Jacob Bohlin --- ethosu/vela/graph_optimiser.py | 53 +++++++++++++++++++++--- ethosu/vela/mark_tensors.py | 4 +- ethosu/vela/operation.py | 3 ++ ethosu/vela/pass_packing.py | 2 +- ethosu/vela/register_command_stream_generator.py | 8 +++- ethosu/vela/supported_operators.py | 30 +++++++++++++- ethosu/vela/weight_compressor.py | 8 +++- 7 files changed, 96 insertions(+), 12 deletions(-) (limited to 'ethosu') diff --git a/ethosu/vela/graph_optimiser.py b/ethosu/vela/graph_optimiser.py index b004f4cc..ca8b89fc 100644 --- a/ethosu/vela/graph_optimiser.py +++ b/ethosu/vela/graph_optimiser.py @@ -131,12 +131,50 @@ def calc_padding_and_skirt(padding_type, kernel_size, stride, input_dims): skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad) return padding, skirt +def calc_upscaled_padding_and_skirt(padding_type, kernel_size, stride, input_dims): + upscaled_shape = [input_dims[0], input_dims[1] * stride[1], input_dims[2] * stride[2], input_dims[3]] + ypad = needed_total_padding(int(upscaled_shape[1]), int(stride[1]), int(kernel_size[0])) + xpad = needed_total_padding(int(upscaled_shape[2]), int(stride[2]), int(kernel_size[1])) + + if padding_type == b"SAME": + right_pad = ((xpad + 1) // 2) - 1 + bottom_pad = ((ypad + 1) // 2) - 1 + left_pad = max(kernel_size[0] - 1 - right_pad, 0) + top_pad = max(kernel_size[1] - 1 - bottom_pad, 0) + elif padding_type == b"VALID": + right_pad = (xpad + 1) // 2 + bottom_pad = (ypad + 1) // 2 + left_pad = max(kernel_size[0] - right_pad, 0) + top_pad = max(kernel_size[1] - bottom_pad, 0) + else: + assert 0, "Unknown padding" + + padding = (top_pad, left_pad, bottom_pad, right_pad) + skirt = (top_pad, left_pad, ypad - top_pad, xpad - left_pad) + return padding, skirt + def 
fixup_conv2d_backprop(op, arch): if op.type == "Conv2DBackpropInput": # flip the inputs op.inputs[0], op.inputs[2] = op.inputs[2], op.inputs[0] - op.type = "Conv2DBackpropInputSwitched" + op.type = "Conv2DBackpropInputSwitchedBias" + weight_shape = op.inputs[1].shape + weight_sets = weight_shape[3] + + if len(op.inputs) < 4: + # Add bias/scale tensor filled with zeros + scale_op = Operation("Const", op.name + "_bias") + scale_tens = Tensor([weight_sets], DataType.int32, op.name + "_bias_tens") + scale_tens.values = [0] * weight_sets + scale_tens.quant_values = [0] * weight_sets + scale_tens.ops = [scale_op] + scale_op.outputs = [scale_tens] + scale_tens.consumer_list = [op] + op.inputs.append(scale_tens) + + # Update strides + op.attrs.update( {"stride_w": 1, "stride_h": 1, "strides": (1,1,1,1)} ) return op @@ -292,15 +330,20 @@ def add_padding_fields(op, arch): else: raise UnsupportedFeatureError("Unknown operation that uses padding: {}".format(op.type)) - dilation_h, dilation_w = op.get_dilation_h_w() - dilated_kernel_size = [dilation_h * (kernel_size[0] - 1) + 1, dilation_w * (kernel_size[1] - 1) + 1] - padding, skirt = calc_padding_and_skirt(op.attrs["padding"], dilated_kernel_size, op.attrs["strides"], input_shape) + if op.type == "Conv2DBackpropInputSwitchedBias": + padding, skirt = calc_upscaled_padding_and_skirt(op.attrs["padding"], kernel_size, op.attrs["strides"], input_shape) + else: + dilation_h, dilation_w = op.get_dilation_h_w() + dilated_kernel_size = [dilation_h * (kernel_size[0] - 1) + 1, dilation_w * (kernel_size[1] - 1) + 1] + padding, skirt = calc_padding_and_skirt(op.attrs["padding"], dilated_kernel_size, op.attrs["strides"], input_shape) + op.attrs["explicit_padding"] = padding op.attrs["skirt"] = skirt + return op -conv_op = set(("Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched", "Conv2DBiasAct")) +conv_op = set(("Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitchedBias", "Conv2DBiasAct")) fc_op = set( ( "MatMul", diff --git 
a/ethosu/vela/mark_tensors.py b/ethosu/vela/mark_tensors.py index bf7bc45f..5231e860 100644 --- a/ethosu/vela/mark_tensors.py +++ b/ethosu/vela/mark_tensors.py @@ -111,8 +111,8 @@ tensor_purposes = [ # ops, input_purpose purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]), ), ( - set(("Conv2DBackpropInputSwitched",)), - purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap]), + set(("Conv2DBackpropInputSwitchedBias",)), + purpose_from_list([TensorPurpose.FeatureMap, TensorPurpose.Weights, TensorPurpose.FeatureMap, TensorPurpose.FeatureMap]), ), ( set(("QuantizedConv2D", "QuantizedMatMul")), diff --git a/ethosu/vela/operation.py b/ethosu/vela/operation.py index e8a03b7d..51311ef7 100644 --- a/ethosu/vela/operation.py +++ b/ethosu/vela/operation.py @@ -75,6 +75,9 @@ input and output tensors, as well as an attribute dictionary.""" if len(self.inputs) >= 3: bias_idx = 2 + elif self.type == "Conv2DBackpropInputSwitchedBias": + bias_idx = 3 + elif npu_block_type == NpuBlockType.Pooling: ifm_idx = 0 ofm_idx = 0 diff --git a/ethosu/vela/pass_packing.py b/ethosu/vela/pass_packing.py index 9bf080e6..5841ca23 100644 --- a/ethosu/vela/pass_packing.py +++ b/ethosu/vela/pass_packing.py @@ -48,7 +48,7 @@ mac_main_ops = set( "Conv2DBiasAct", "Conv2D", "QuantizedConv2D", - "Conv2DBackpropInputSwitched", + "Conv2DBackpropInputSwitchedBias", # depth-wise convolutions "DepthwiseConv2dBiasAct", "DepthwiseConv2dNative", diff --git a/ethosu/vela/register_command_stream_generator.py b/ethosu/vela/register_command_stream_generator.py index 3da8bbcf..11c0c20d 100644 --- a/ethosu/vela/register_command_stream_generator.py +++ b/ethosu/vela/register_command_stream_generator.py @@ -38,6 +38,7 @@ from .ethos_u55_regs.ethos_u55_regs import cmd0 from .ethos_u55_regs.ethos_u55_regs import cmd1 from .ethos_u55_regs.ethos_u55_regs import elementwise_mode from .ethos_u55_regs.ethos_u55_regs import ifm_precision +from 
.ethos_u55_regs.ethos_u55_regs import resampling_mode from .ethos_u55_regs.ethos_u55_regs import rounding from .high_level_command_stream import CommandType from .numeric_util import clamp_sigmoid @@ -555,9 +556,12 @@ def generate_register_command_stream(nng, sg, arch, verbose=False): if primary_op.type == "ResizeBilinear": # perform nearest neighbor upscale - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 1) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NEAREST) + elif primary_op.type == "Conv2DBackpropInputSwitchedBias": + # perform insert zero upscale + emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.TRANSPOSE) else: - emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, 0) + emit.cmd0_with_param(cmd0.NPU_SET_IFM_UPSCALE, resampling_mode.NONE) if npu_block_type in set( (NpuBlockType.ConvolutionMxN, NpuBlockType.ConvolutionDepthWise, NpuBlockType.Pooling) diff --git a/ethosu/vela/supported_operators.py b/ethosu/vela/supported_operators.py index 729d435a..cbd5d6cc 100644 --- a/ethosu/vela/supported_operators.py +++ b/ethosu/vela/supported_operators.py @@ -22,10 +22,11 @@ class SupportedOperators: def __init__(self): # Categorised lists of supported operators self.npu_pre_ops = set(("QuantizedResizeBilinear", "SplitSliceRead")) - self.convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D", "Conv2DBackpropInputSwitched")) + self.convolution_ops = set(("Conv2DBiasAct", "Conv2D", "QuantizedConv2D")) self.depthwise_convolution_ops = set( ("DepthwiseConv2dBiasAct", "DepthwiseConv2dNative", "QuantizedDepthwiseConv2D") ) + self.transpose_convolution_ops = set(("Conv2DBackpropInput",)) self.max_pooling_ops = set(("QuantizedMaxPool", "MaxPool", "MaxPoolAct")) self.avg_pooling_ops = set(("QuantizedAvgPool", "AvgPool", "AvgPoolAct")) self.pooling_ops = self.max_pooling_ops | self.avg_pooling_ops @@ -36,6 +37,8 @@ class SupportedOperators: self.convolution_ops # depth-wise convolutions | self.depthwise_convolution_ops + # transpose 
convolutions + | self.transpose_convolution_ops # pooling | self.pooling_ops # resizing/upscaling @@ -90,6 +93,9 @@ class SupportedOperators: self.supported_operator_restrictions.update( {op: self.check_depthwise_convolution_restrictions for op in self.depthwise_convolution_ops} ) + self.supported_operator_restrictions.update( + {op: self.check_transpose_convolution_restrictions for op in self.transpose_convolution_ops} + ) self.supported_operator_restrictions.update({op: self.check_pooling_restrictions for op in self.pooling_ops}) self.supported_operator_restrictions.update({op: self.check_resize_restrictions for op in self.resizing_ops}) self.supported_operator_restrictions.update( @@ -180,6 +186,28 @@ class SupportedOperators: return False return self.check_convolution_restrictions(op) + def check_transpose_convolution_restrictions(self, op): + # check stride + stride_h, stride_w = op.attrs["stride_h"], op.attrs["stride_w"] + if stride_h != stride_w != 2: + return False + + # check output dimensions + ifm_tensor, weight_tensor, _, ofm_tensor = op.get_ifm_weights_biases_ofm() + ifm_h, ifm_w = ifm_tensor.shape[1], ifm_tensor.shape[2] + ofm_h, ofm_w = ofm_tensor.shape[1], ofm_tensor.shape[2] + if op.attrs["padding"] == b"SAME": + if (ofm_h != ifm_h * stride_h) or (ofm_w != ifm_w * stride_w): + return False + elif op.attrs["padding"] == b"VALID": + kernel_h, kernel_w = weight_tensor.shape[0], weight_tensor.shape[1] + if ((ofm_h != (ifm_h) * stride_h + max(kernel_h - stride_h, 0)) + or (ofm_w != (ifm_w) * stride_w + max(kernel_w - stride_w, 0))): + return False + + return self.check_convolution_restrictions(op) + + def check_pooling_restrictions(self, op): # check stride if op.attrs["stride_w"] > 3 or op.attrs["stride_h"] > 3: diff --git a/ethosu/vela/weight_compressor.py b/ethosu/vela/weight_compressor.py index 9edde601..df2b0573 100644 --- a/ethosu/vela/weight_compressor.py +++ b/ethosu/vela/weight_compressor.py @@ -234,6 +234,10 @@ def compress_weights(arch, nng, 
tens, npu_block_type, ofm_block_depth, ofm_depth else: tens.block_traversal = TensorBlockTraversal.DepthFirst + if tens.consumer_list[0].type == "Conv2DBackpropInputSwitchedBias": + # Transpose Convolution, reverse weights in H and W axes + weights = np.flip(weights, axis=(0,1)) + # Slice weight stream up depth-ways into bricks and compress full_ofm_depth = quant_buf.shape[-1] for idx in range(0, full_ofm_depth, ofm_depth_step): @@ -273,7 +277,9 @@ def calc_scales_and_pack_biases(tens, arch, oc_quantum, rescale_for_faf=False): # the connected operator should expect a bias input unless it is a FullyConnected assert "Bias" in tens.consumer_list[0].type or tens.consumer_list[0].type.startswith("FullyConnected") # the input bias tensor is the same as that connected to the operator - assert tens is tens.consumer_list[0].inputs[2] + _, _, bias_tens, _ = tens.consumer_list[0].get_ifm_weights_biases_ofm() + assert tens is bias_tens + # the operator should only have a single output assert len(tens.consumer_list[0].outputs) == 1 -- cgit v1.2.1