aboutsummaryrefslogtreecommitdiff
path: root/chapters/tensor_ops.adoc
diff options
context:
space:
mode:
Diffstat (limited to 'chapters/tensor_ops.adoc')
-rw-r--r--chapters/tensor_ops.adoc319
1 files changed, 160 insertions, 159 deletions
diff --git a/chapters/tensor_ops.adoc b/chapters/tensor_ops.adoc
index 571b9aa..341f51d 100644
--- a/chapters/tensor_ops.adoc
+++ b/chapters/tensor_ops.adoc
@@ -18,9 +18,9 @@ This returns the index with the largest value across the given axis of the input
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t*|input|input_shape|Input tensor dimension k \<=4
+|Input|in_t*|input|shape1|Input tensor dimension k \<=4
|Attribute|int|axis|-|Axis in range 0 to k-1
-|Output|out_t*|output|output_shape|Output tensor dimension k-1
+|Output|out_t*|output|shape|Output tensor dimension k-1
|===
*Quantization Parameters:*
@@ -31,20 +31,20 @@ None
[source,c]
----
-assert(axis >= 0 && axis < k && k <=4)
-left_shape = input_shape[0:axis-1]
-right_shape = input_shape[axis+1:k-1]
-assert( concat(left_shape, right_shape) == output_shape )
-for_each ( left_index in left_shape, right_index in right_shape )
- in_t max_value = minimum_value<in_t>
- int32 max_index = 0;
- for (i=0; i<shape[axis]; i++) {
- index = concat(left_index, [i], right_index)
- in_t value = tensor_read<in_t>(input, input_shape, index)
+assert(axis >= 0 && axis < k && k <=4);
+left_shape = shape1[0:axis-1];
+right_shape = shape1[axis+1:k-1];
+assert(flatten(left_shape, right_shape) == shape);
+for_each(left_index in left_shape, right_index in right_shape) {
+ in_t max_value = minimum_value<in_t>;
+ int32_t max_index = 0;
+ for (i = 0; i < shape[axis]; i++) {
+ index = flatten(left_index, [i], right_index);
+ in_t value = tensor_read<in_t>(input, shape1, index);
if (value > max_value) { max_value = value; max_index=i; }
}
- index = concat(left_index, right_index)
- tensor_write<int32_t>(output, output_shape, index, max_index)
+ index = flatten(left_index, right_index);
+ tensor_write<int32_t>(output, shape, index, max_index);
}
----
@@ -53,9 +53,9 @@ for_each ( left_index in left_shape, right_index in right_shape )
|===
|Profile|Mode|in_t|out_t
-|Any|signed 8|int8|int32
-|Any|signed 16|int16|int32
-|MI, MT|float|float|int32
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|MI, MT|floating-point|float_t|int32_t
|===
==== AVG_POOL2D
@@ -67,11 +67,11 @@ This performs an average pooling over the given input tensor. A sliding window o
|===
|Argument|Type|Name|Shape|Description
-|Input|in_t *|input|[N,H,W,C]|Input tensor 4D
-|Attribute|int *|kernel|[2]|[kernel_y, kernel_x]
-|Attribute|int *|stride|[2]|[stride_y, stride_x]
-|Attribute|int *|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|out_t *|output|[N,H,W,C]|Output tensor 4D
+|Input|in_t*|input|[N,H,W,C]|Input tensor 4D
+|Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
+|Attribute|int*|stride|[2]|[stride_y, stride_x]
+|Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
+|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
|===
*Quantization Parameters:*
@@ -80,46 +80,47 @@ This performs an average pooling over the given input tensor. A sliding window o
|Argument|Type|Name|Shape|Description
|Attribute|in_t|input_zp|-|Input tensor zero point
-|Attribute|out_t|output_zp|-|Output tensor zero point
+|Attribute|in_t|output_zp|-|Output tensor zero point
|===
*Operation Function:*
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(out_t == int8_t || output_zp == 0) // Zero point only for int8
-pad=concat([0,0],pad,[0,0])
-for_each ( 0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(in_t == int8_t || output_zp == 0); // Zero point only for int8_t
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C) {
+ in_t output_val;
acc_t acc = 0;
int count = 0;
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each ( 0 <= ky < kernel_y, 0 <= kx < kernel_x) {
- y = iy + ky
- x = ix + kx
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad)
- acc = apply_add<acc_t>(acc, value)
- if (0<=y<IH and 0<=x<IW) count++
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
+ y = iy + ky;
+ x = ix + kx;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], input_zp, pad);
+ acc = apply_add<acc_t>(acc, value);
+ if (0 <= y < IH and 0 <= x < IW) count++;
}
if (is_float(out_t)) {
- value = value / (float)count;
+ output_val = acc / (float)count;
} else {
- scale_t scale = reciprocal_scale(count)
- acc = apply_scale_32(acc, scale.multiplier, scale.shift, false)
- acc = apply_clip(acc + output_zp, output_min, output_max)
+ scale_t scale = reciprocal_scale(count);
+ acc = apply_scale_32(acc, scale.multiplier, scale.shift, false);
+ output_val = apply_clip<in_t>(acc + output_zp, minimum<in_t>, maximum<in_t>);
}
- tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], output_val);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|acc_t|out_t
+|Profile|Mode|in_t|acc_t
-|Any|signed 8|int8|int32_t|int8
-|Any|signed 16|int16|int32_t|int16
-|MI, MT|float|float|float|float
+|Any|signed 8|int8_t|int32_t
+|Any|signed 16|int16_t|int32_t
+|MI, MT|floating-point|float_t|float_t
|===
==== CONV2D
@@ -153,22 +154,22 @@ Performs a 2D convolution over the given tensor input, using the weight tensor.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
- acc_t acc = 0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= oc < OC) {
+ acc_t acc = 0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,ic], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc);
}
----
@@ -177,10 +178,10 @@ for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== CONV3D
@@ -214,24 +215,24 @@ Performs a 3D convolution over the given input tensor.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
- acc_t acc = 0
- id = od * stride_d - pad_d0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
- d = id + kd * dilation_d
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W, 0 <= oc < OC) {
+ acc_t acc = 0;
+ id = od * stride_d - pad_d0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= kd < KD, 0 <= ky < KH, 0 <= kx < KW, 0 <= ic < IC) {
+ d = id + kd * dilation_d;
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,ID,IH,IW,IC], [n,d,y,x,ic], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight,[OC,KD,KH,KW,IC],[oc,kd,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,D,H,W,OC], [n,od,oy,ox,oc], acc);
}
----
@@ -240,10 +241,10 @@ for_each (0 <= n < N, 0 <= od < D, 0 <= oy < H, 0 <= ox < W; 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
@@ -278,22 +279,22 @@ Performs 2D convolutions separately over each channel of the given tensor input,
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
- acc_t acc = 0
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each (0 <= ky < KH, 0 <= kx < KW) {
- y = iy + ky * dilation_y
- x = ix + kx * dilation_x
- in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad)
- weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C, 0 <= m < M) {
+ acc_t acc = 0;
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < KH, 0 <= kx < KW) {
+ y = iy + ky * dilation_y;
+ x = ix + kx * dilation_x;
+ in_t value = tensor_read<in_t>(input, [N,H,W,C], [n,y,x,c], input_zp, pad);
+ weight_t weight = tensor_read<weight_t>(weight, [KH,KW,C,M], [ky,kx,c,m], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[(c*M) + m])
- tensor_write<acc_t>(output, [N,H,W,C*M], [n,oy,ox,c*M+m], acc)
+ acc = apply_add<acc_t>(acc, bias[(c * M) + m]);
+ tensor_write<acc_t>(output, [N,H,W,C * M], [n,oy,ox,c * M + m], acc);
}
----
@@ -302,10 +303,10 @@ for_each (0 <= n<N, 0 <= oy < H, 0 <= ox < W; 0 <= c < (C * M), 0 <= m < M) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== FULLY_CONNECTED
@@ -336,17 +337,17 @@ Performs a fully connected network.
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only for int8
-assert(weight_t == int8_t || weight_zp == 0)
-for_each (0 <= n < N, 0 <= oc < OC) {
- acc_t acc = 0
- for_each (0 <= ic < IC) {
- in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp)
- weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
+assert(in_t == int8_t || input_zp == 0); // Zero point only for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+for_each(0 <= n < N, 0 <= oc < OC) {
+ acc_t acc = 0;
+ for_each(0 <= ic < IC) {
+ in_t value = tensor_read<in_t>(input, [N,IC], [n,ic], input_zp);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,IC], [oc,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
}
- acc = apply_add<acc_t>(acc, bias[oc])
- tensor_write<acc_t>(output, [N,OC], [n,oc], acc)
+ acc = apply_add<acc_t>(acc, bias[oc]);
+ tensor_write<acc_t>(output, [N,OC], [n,oc], acc);
}
----
@@ -355,10 +356,10 @@ for_each (0 <= n < N, 0 <= oc < OC) {
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8 |int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8 |int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===
==== MATMUL
@@ -387,15 +388,15 @@ Performs two dimensional matrix multiplications. This allows both inputs to be a
[source,c]
----
-assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)) // Zero point only for int8
-for_each (0 <= n < N, 0 <= h < H, 0 <= w < W) {
- acc_t acc = 0
- for_each (0 <= c < C) {
- in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp)
- in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp)
- acc = apply_add<acc_t>(acc, value1 * value2)
+assert(in_t == int8_t || (A_zp == 0 && B_zp == 0)); // Zero point only for int8_t
+for_each(0 <= n < N, 0 <= h < H, 0 <= w < W) {
+ acc_t acc = 0;
+ for_each(0 <= c < C) {
+ in_t value1 = tensor_read<in_t>(A, [N,H,C], [n,h,c], A_zp);
+ in_t value2 = tensor_read<in_t>(B, [N,C,W], [n,c,w], B_zp);
+ acc = apply_add<acc_t>(acc, value1 * value2);
}
- tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc)
+ tensor_write<acc_t>(output, [N,H,W], [n,h,w], acc);
}
----
@@ -404,9 +405,9 @@ for_each (0 <= n < N, 0 <= h < H, 0 <= w < W) {
|===
|Profile|Mode|in_t|acc_t
-|Any|signed 8x8|int8|int32
-|Any|signed 16x16|int16|int48
-|MI, MT|float|float|float
+|Any|signed 8x8|int8_t|int32_t
+|Any|signed 16x16|int16_t|int48_t
+|MI, MT|floating-point|float_t|float_t
|===
==== MAX_POOL2D
@@ -421,7 +422,7 @@ This performs a max pooling over the given input tensor. A sliding window of siz
|Attribute|int*|kernel|[2]|[kernel_y, kernel_x]
|Attribute|int*|stride|[2]|[stride_y, stride_x]
|Attribute|int*|pad|[4]|[pad_top, pad_bottom, pad_left, pad_right]
-|Output|out_t*|output|[N,H,W,C]|Output tensor 4D
+|Output|in_t*|output|[N,H,W,C]|Output tensor 4D
|===
*Quantization Parameters:*
@@ -432,29 +433,29 @@ None
[source,c]
----
-pad=concat([0,0], pad, [0,0])
-for_each (0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C ) {
+pad = flatten([0,0], pad, [0,0]);
+for_each(0 <= n < N, 0 <= oy < H, 0 <= ox < W, 0 <= c < C) {
in_t acc = minimum_value<in_t>;
- iy = oy * stride_y - pad_top
- ix = ox * stride_x - pad_left
- for_each ( 0<=ky<kernel_y, 0<=kx<kernel_x ) {
- y = iy + ky
- x = ix + kx
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], pad)
- acc = apply_max(acc, value)
+ iy = oy * stride_y - pad_top;
+ ix = ox * stride_x - pad_left;
+ for_each(0 <= ky < kernel_y, 0 <= kx < kernel_x) {
+ y = iy + ky;
+ x = ix + kx;
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,y,x,c], pad);
+ acc = apply_max(acc, value);
}
- tensor_write<out_t>(output, [N,H,W,OC], [n,oy,ox,oc], acc)
+ tensor_write<in_t>(output, [N,H,W,C], [n,oy,ox,c], acc);
}
----
*Supported Data Types:*
|===
-|Profile|Mode|in_t|out_t
+|Profile|Mode|in_t
-|Any|signed 8|int8|int8
-|Any|16-bit|int16|int16
-|MI, MT|float|float|float
+|Any|signed 8|int8_t
+|Any|16-bit|int16_t
+|MI, MT|floating-point|float_t
|===
==== TRANSPOSE_CONV2D
@@ -488,21 +489,21 @@ Performs a 2D transposed convolution over the given tensor input, using the weig
[source,c]
----
-assert(in_t == int8_t || input_zp == 0) // Zero point only allowed for int8
-assert(weight_t == int8_t || weight_zp == 0)
-for_each (index in out_shape) {
+assert(in_t == int8_t || input_zp == 0); // Zero point only allowed for int8_t
+assert(weight_t == int8_t || weight_zp == 0);
+for_each(index in out_shape) {
tensor_write<acc_t>(output, [N,OH,OW,OC], index, bias[index[3]])
}
-for_each (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
+for_each(0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
0 <= ic < IC, 0 <= ky < KH, 0 <= kx < KW) {
- oy = iy * stride_y - out_pad_top + ky
- ox = ix * stride_x - out_pad_left + kx
- if (oy>=0 && oy<OH && ox>=0 && ox<OW) {
- acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc])
- in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp)
- weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp)
- acc = apply_add<acc_t>(acc, value * weight)
- tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc)
+ oy = iy * stride_y - out_pad_top + ky;
+ ox = ix * stride_x - out_pad_left + kx;
+ if (oy >= 0 && oy < OH && ox >= 0 && ox < OW) {
+ acc_t acc = tensor_read<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc]);
+ in_t value = tensor_read<in_t>(input, [N,IH,IW,IC], [n,iy,ix,ic], input_zp);
+ weight_t weight = tensor_read<weight_t>(weight, [OC,KH,KW,IC], [oc,ky,kx,ic], weight_zp);
+ acc = apply_add<acc_t>(acc, value * weight);
+ tensor_write<acc_t>(output, [N,OH,OW,OC], [n,oy,ox,oc], acc);
}
}
----
@@ -512,8 +513,8 @@ for_each (0 <= n < N, 0 <= iy < IH, 0 <= ix < IW, 0 <= oc < OC,
|===
|Profile|Mode|in_t|weight_t|acc_t
-|Any|signed 8x8|int8|int8|int32
-|Any|signed 8x4|int8|int4|int32
-|Any|signed 16x8|int16|int8|int48
-|MI, MT|float|float|float|float
+|Any|signed 8x8|int8_t|int8_t|int32_t
+|Any|signed 8x4|int8_t|int4_t|int32_t
+|Any|signed 16x8|int16_t|int8_t|int48_t
+|MI, MT|floating-point|float_t|float_t|float_t
|===