aboutsummaryrefslogtreecommitdiff
path: root/chapters/tensor_ops.adoc
diff options
context:
space:
mode:
Diffstat (limited to 'chapters/tensor_ops.adoc')
-rw-r--r--chapters/tensor_ops.adoc319
1 files changed, 160 insertions, 159 deletions
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index 571b9aa..341f51d 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -18,9 +18,9 @@ This returns the index with the largest value across the given axis of the input
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|input_shape|Input tensor dimension k \<=4
+|Input|in_t*|input|shape1|Input tensor dimension k \<=4
|Attribute|int|axis|-|Axis in range 0 to k-1
-|Output|out_t*|output|output_shape|Output tensor dimension k-1
+|Output|out_t*|output|shape|Output tensor dimension k-1
|===
*Quantization Parameters:*
@@ -31,20 +31,20 @@ None
[source,c]
----
-assert(axis >= 0 && axis < k && k <=4)
-left_shape = input_shape[0:axis-1]
-right_shape = input_shape[axis+1:k-1]
-assert( concat(left_shape, right_shape) == output_shape )
-for_each ( left_index in left_shape, right_index in right_shape )
- in_t max_value = minimum_value<in_t>
- int32 max_index = 0;
- for (i=0; i<shape[axis]; i++) {
- index = concat(left_index, [i], right_index)
- in_t value = tensor_read<in_t>(input, input_shape, index)
+assert(axis >= 0 && axis < k && k <=4);
+left_shape = shape1[0:axis-1];
+right_shape = shape1[axis+1:k-1];
+assert(flatten(left_shape, right_shape) == shape);
+for_each(left_index in left_shape, right_index in right_shape) {
+ in_t max_value = minimum_value<in_t>;
+ int32_t max_index = 0;
+ for (i = 0; i < shape[axis]; i++) {
+ index = flatten(left_index, [i], right_index);
+ in_t value = tensor_read<in_t>(input, shape1, index);
if (value > max_value) { max_value = value; max_index=i; }
}
- index = concat(left_index, right_index)
- tensor_write<int32_t>(output, output_shape, index, max_index)
+ index = flatten(left_index, right_index);
+ tensor_write<int32_t>(output, shape, index, max_index);
}
----
@@ -53,9 +53,9 @@ for_each ( left_index in left_shape, right_index in right_shape )
|===
|Profile|Mode|in_t|out_t
-|Any|signed 8|int8|int32
-|Any|signed 16|int16|int32
-|MI, MT|float|float|int32
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|MI, MT|floating-point|float_t|int32_t
|===
==== AVG_POOL2D
@@ -67,11 +67,11 @@ This performs an average pooling over the given input tensor. A sliding window o
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t *|input|[N,H,W,C]|Input tensor 4D
-|Attribute|int *|kernel|[2]|[kernel_y, kernel_x]
-|Attribute|int *|stride|[2]|[stride_y, stride_x]
-|Attribute|int *|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|out_t *|output|[N,H,W,C]|Output tensor 4D
+|Input|in_t*|input|[N,H,W,C]|Input tensor 4D
+|Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
+|Attribute|int*|stride|[2]|[stride_y, stride_x]
+|Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
+|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
|===
*Quantization Parameters:*
@@ -80,46 +80,47 @@ This performs an average pooling over the given input tensor. A sliding window o
|Argument|Type|Name|Shape|Description
|Attribute|in_t|input_zp|-|Input tensor zero point
-|Attribute|out_t|output_zp|-|Output tensor zero point
+|Attribute|in_t|output_zp|-|Output tensor zero point
|===
*Operation Function:*
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(out_t == int8_t || output_zp == 0) // Zero point only for int8
-pad=concat([0,0],pad,[0,0])
-for_each ( 0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(in_t == int8_t || output_zp == 0); // Zero point only for int8_t
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C) {
+ in_t output_val;
acc_t acc = 0;
int count = 0;
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each ( 0 <= ky < kernel_y, 0 <= kx < kernel_x) {
- y = iy + ky
- x = ix + kx
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad)
- acc = apply_add<acc_t>(acc, value)
- if (0<=y<IH and 0<=x<IW) count++
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
+ y = iy + ky;
+ x = ix + kx;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad);
+ acc = apply_add<acc_t>(acc, value);
+ if (0 <= y < IH and 0 <= x < IW) count++;
}
if (is_float(out_t)) {
- value = value / (float)count;
+ output_val = acc / (float)count;
} else {
- scale_t scale = reciprocal_scale(count)
- acc = apply_scale_32(acc, scale.multiplier, scale.shift, false)
- acc = apply_clip(acc + output_zp, output_min, output_max)
+ scale_t scale = reciprocal_scale(count);
+ acc = apply_scale_32(acc, scale.multiplier, scale.shift, false);
+ output_val = apply_clip<in_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
}
- tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], output_val);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|acc_t|out_t
+|Profile|Mode|in_t|acc_t
-|Any|signed 8|int8|int32_t|int8
-|Any|signed 16|int16|int32_t|int16
-|MI, MT|float|float|float|float
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|MI, MT|floating-point|float_t|float_t
|===
==== CONV2D
@@ -153,22 +154,22 @@ Performs a 2D convolution over the given tensor input, using the weight tensor.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
- acc_t acc = 0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= oc < OC) {
+ acc_t acc = 0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc);
}
----
@@ -177,10 +178,10 @@ for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== CONV3D
@@ -214,24 +215,24 @@ Performs a 3D convolution over the given input tensor.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
- acc_t acc = 0
- id = od * stride_d - pad_d0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
- d = id + kd * dilation_d
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W, 0 <= oc < OC) {
+ acc_t acc = 0;
+ id = od * stride_d - pad_d0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+ d = id + kd * dilation_d;
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc);
}
----
@@ -240,10 +241,10 @@ for_each (0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
@@ -278,22 +279,22 @@ Performs 2D convolutions separately over each channel of the given tensor input,
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
- acc_t acc = 0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= ky < KH, 0 <= kx < KW) {
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C, 0 <= m < M) {
+ acc_t acc = 0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < KH, 0 <= kx < KW) {
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[(c*M) + m])
- tensor_write<acc_t>(output, [N,H,W,C*M], [n,oy,ox,c*M+m], acc)
+ acc = apply_add<acc_t>(acc, bias[(c * M) + m]);
+ tensor_write<acc_t>(output, [N,H,W,C * M], [n,oy,ox,c * M + m], acc);
}
----
@@ -302,10 +303,10 @@ for_each (0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== FULLY_CONNECTED
@@ -336,17 +337,17 @@ Performs a fully connected network.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-for_each (0 <= n < N, 0 <= oc < OC) {
- acc_t acc = 0
- for_each (0 <= ic < IC) {
- in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp)
- weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+for_each(0 <= n < N, 0 <= oc < OC) {
+ acc_t acc = 0;
+ for_each(0 <= ic < IC) {
+ in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,OC], [n,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,OC], [n,oc], acc);
}
----
@@ -355,10 +356,10 @@ for_each (0 <= n < N, 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8 |int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8 |int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== MATMUL
@@ -387,15 +388,15 @@ Performs two dimensional matrix multiplications. This allows both inputs to be a
[source,c]
----
-assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)) // Zero point only for int8
-for_each (0 <= n < N, 0 <= h < H, 0 <= w < W) {
- acc_t acc = 0
- for_each (0 <= c < C) {
- in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp)
- in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp)
- acc = apply_add<acc_t>(acc, value1 * value2)
+assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)); // Zero point only for int8_t
+for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
+ acc_t acc = 0;
+ for_each(0 <= c < C) {
+ in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp);
+ in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp);
+ acc = apply_add<acc_t>(acc, value1 * value2);
}
- tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc)
+ tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc);
}
----
@@ -404,9 +405,9 @@ for_each (0 <= n < N, 0 <= h < H, 0 <= w < W) {
|===
|Profile|Mode|in_t|acc_t
-|Any|signed 8x8|int8|int32
-|Any|signed 16x16|int16|int48
-|MI, MT|float|float|float
+|Any|signed 8x8|int8_t|int32_t
+|Any|signed 16x16|int16_t|int48_t
+|MI, MT|floating-point|float_t|float_t
|===
==== MAX_POOL2D
@@ -421,7 +422,7 @@ This performs a max pooling over the given input tensor. A sliding window of siz
|Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
|Attribute|int*|stride|[2]|[stride_y, stride_x]
|Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|out_t*|output|[N,H,W,C]|Output tensor 4D
+|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
|===
*Quantization Parameters:*
@@ -432,29 +433,29 @@ None
[source,c]
----
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C) {
in_t acc = minimum_value<in_t>;
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each ( 0<=ky<kernel_y, 0<=kx<kernel_x ) {
- y = iy + ky
- x = ix + kx
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], pad)
- acc = apply_max(acc, value)
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
+ y = iy + ky;
+ x = ix + kx;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], pad);
+ acc = apply_max(acc, value);
}
- tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 8|int8|int8
-|Any|16-bit|int16|int16
-|MI, MT|float|float|float
+|Any|signed 8|int8_t
+|Any|16-bit|int16_t
+|MI, MT|floating-point|float_t
|===
==== TRANSPOSE_CONV2D
@@ -488,21 +489,21 @@ Performs a 2D transposed convolution over the given tensor input, using the weig
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only allowed for int8
-assert(weight_t == int8_t || weight_zp == 0)
-for_each (index in out_shape) {
+assert(in_t == int8_t || input_zp == 0); // Zero point only allowed for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+for_each(index in out_shape) {
tensor_write<acc_t>(output, [N,OH,OW,OC], index, bias[index[3]])
}
-for_each (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
+for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
0 <= ic < IC, 0 <= ky < KH, 0 <= kx < KW) {
- oy = iy * stride_y - out_pad_top + ky
- ox = ix * stride_x - out_pad_left + kx
- if (oy>=0 && oy<OH && ox>=0 && ox<OW) {
- acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc])
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp)
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
- tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc)
+ oy = iy * stride_y - out_pad_top + ky;
+ ox = ix * stride_x - out_pad_left + kx;
+ if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
+ acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
+ tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
}
}
----
@@ -512,8 +513,8 @@ for_each (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===