author    James Ward <james.ward@arm.com>    2023-10-09 16:51:26 +0000
committer Eric Kunze <eric.kunze@arm.com>    2023-10-20 16:32:14 +0000
commit    5a9e0cda44bf937c325df42b493cfa8d97b70e52 (patch)
tree      e36b7b929ef91115cc99a2d18e29b6cda2e6edf4 /reference_model/src/ops/tensor_ops.cc
parent    c48590e3097308b420b57b5816e3e21bf47c88b0 (diff)
download  reference_model-5a9e0cda44bf937c325df42b493cfa8d97b70e52.tar.gz
Fix AVG_POOL2D bad_alloc with large kernel sizes
- Nested looping instead of using Eigen extract_image_patches to avoid OOM errors

Signed-off-by: James Ward <james.ward@arm.com>
Change-Id: Id4d78d5b5dd04a00134f29b1d29f814195515b1f
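The bad_alloc comes from the im2col intermediate: extract_image_patches materialises one element per (kernel position, output position) pair, so the buffer grows as kernel_y * kernel_x * N * out_H * out_W * C, and a large kernel makes that single allocation fail. The nested-loop rewrite visits the same elements but only ever holds the output tensor and one accumulator. A rough sizing sketch of the difference (the shape below is illustrative only, not taken from a real failing test):

#include <cstdint>
#include <cstdio>

int main()
{
    // Hypothetical AVG_POOL2D shape with a very large kernel, fp32 elements.
    const int64_t N = 1, out_H = 8, out_W = 8, C = 1;
    const int64_t kernel_y = 8192, kernel_x = 8192;
    const int64_t elem = sizeof(float);

    // Old path: im2col buffer shaped [KH * KW, N * out_H * out_W * C].
    const int64_t im2col_bytes = kernel_y * kernel_x * N * out_H * out_W * C * elem;
    // New path: just the [N, out_H, out_W, C] output plus a scalar accumulator.
    const int64_t loop_bytes = N * out_H * out_W * C * elem;

    std::printf("im2col buffer: %lld bytes (~%lld GiB)\n", (long long)im2col_bytes,
                (long long)(im2col_bytes >> 30));
    std::printf("nested loops:  %lld bytes\n", (long long)loop_bytes);
    return 0;
}

At these shapes the patch buffer alone is ~16 GiB, while the per-element loop needs a few hundred bytes of working state.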
Diffstat (limited to 'reference_model/src/ops/tensor_ops.cc')
-rw-r--r--  reference_model/src/ops/tensor_ops.cc  117
1 file changed, 51 insertions(+), 66 deletions(-)
diff --git a/reference_model/src/ops/tensor_ops.cc b/reference_model/src/ops/tensor_ops.cc
index d9608b7..65e05db 100644
--- a/reference_model/src/ops/tensor_ops.cc
+++ b/reference_model/src/ops/tensor_ops.cc
@@ -537,16 +537,6 @@ int OpAvgPool2d<Dtype, AccDtype>::eval()
                in_batch, in_height, in_width, in_channels, out_batch, out_height, out_width, out_channels, kernel_y,
                kernel_x, stride_y, stride_x, pad_top, pad_bottom, pad_left, pad_right, EnumNamesDType()[accum_dtype]);
 
-    Eigen::array<Eigen::Index, 2> im2col_input_dims;
-    im2col_input_dims[0] = kernel_y * kernel_x;
-    im2col_input_dims[1] = out_batch * out_height * out_width * out_channels;
-
-    Eigen::array<Eigen::Index, 4> col2im_output_dims;
-    col2im_output_dims[0] = out_batch;
-    col2im_output_dims[1] = out_height;
-    col2im_output_dims[2] = out_width;
-    col2im_output_dims[3] = out_channels;
-
     Eigen::array<std::pair<int32_t, int32_t>, 4> pad;
     pad[0] = std::make_pair(0, 0);
     pad[1] = std::make_pair(pad_top, pad_bottom);
@@ -571,68 +561,63 @@ int OpAvgPool2d<Dtype, AccDtype>::eval()
     // so input and output scaling is not required
     // TODO: check if this is an assumption TOSA made
 
-    // extract_image_patches() output [N, KH, KW, H * W, C]
-    // transpose to [KH, KW, N, H * W, C]
-    // reshape to [KH * KW, N * H * W * C]
-    ETensor2<InEigenType> input_extract_patches =
-        input_padded.extract_image_patches(kernel_y, kernel_x, stride_y, stride_x, 1, 1, Eigen::PADDING_VALID)
-            .shuffle(Eigen::array<Eigen::Index, 5>{ 1, 2, 0, 3, 4 })
-            .reshape(im2col_input_dims);
-
-    // 1D result with [N * H * W * C]
-    ETensor1<AccEigenType> out_1d(this->out->getElementCount());
-    out_1d.setZero();
+    ETensor4<OutEigenType> out_tens(out_batch, out_height, out_width, out_channels);
 
     // sum pool
-    for (size_t i = 0; i < this->out->getElementCount(); i++)
-    {
-        for (int32_t j = 0; j < kernel_y * kernel_x; j++)
-        {
-            out_1d(i) += (AccEigenType)input_extract_patches(j, i);
-        }
-    }
-
-    // reshape result to [N, H, W, C] and divide with div_map
-    ETensor4<AccEigenType> sum = out_1d.reshape(col2im_output_dims);
-
-    // calculate 1d height/width div_map (number of elements this pooling window covers)
-    // and outer product to get 2d div_map, then reshape/broadcast to [N, H, W, C]
-    ETensor1<int32_t> div_map_h = calculate_div_map_1d(in_height, out_height, kernel_y, stride_y, pad_top, pad_bottom);
-    ETensor1<int32_t> div_map_w = calculate_div_map_1d(in_width, out_width, kernel_x, stride_x, pad_left, pad_right);
-    Eigen::array<Eigen::IndexPair<Eigen::Index>, 1> contract_dims = { Eigen::IndexPair<Eigen::Index>(1, 0) };
-    Eigen::array<Eigen::Index, 4> bcast{ out_batch, 1, 1, out_channels };
-
-    ETensor2<int32_t> dm2_w = div_map_w.reshape(Eigen::array<Eigen::Index, 2>{ 1, out_width });
-    ETensor2<int32_t> dm2_h = div_map_h.reshape(Eigen::array<Eigen::Index, 2>{ out_height, 1 });
-    ETensor4<int32_t> div_map = dm2_h.contract(dm2_w, contract_dims)
-                                    .reshape(Eigen::array<Eigen::Index, 4>{ 1, out_height, out_width, 1 })
-                                    .broadcast(bcast);
-    if (Dtype != TOSA_REF_TYPE_FP32 && Dtype != TOSA_REF_TYPE_FP16 && Dtype != TOSA_REF_TYPE_BF16 &&
-        Dtype != TOSA_REF_TYPE_FP64)
+    for (int ob = 0; ob < out_batch; ++ob)
     {
-        try
-        {
-            this->out->getTensor() = sum.binaryExpr(div_map, [](AccEigenType value, int32_t div) -> OutEigenType {
-                int32_t multiplier, shift;
-                TosaReference::QuantUtil::reciprocal_scale(div, multiplier, shift);
-
-                return (OutEigenType)TosaReference::QuantUtil::apply_scale_32(value, multiplier, shift, false);
-            });
-        }
-        catch (std::string desc)
+        for (int oh = 0; oh < out_height; ++oh)
         {
-            REQUIRE(false, "OpAvgPool2d apply_scale_32() fails: %s.", desc.c_str());
+            for (int ow = 0; ow < out_width; ++ow)
+            {
+                for (int oc = 0; oc < out_channels; ++oc)
+                {
+                    AccEigenType acc(0);
+                    int filter_count = 0;
+                    const int iy = oh * stride_y - pad_top;
+                    const int ix = ow * stride_x - pad_left;
+                    for (int ky = 0; ky < kernel_y; ++ky)
+                    {
+                        for (int kx = 0; kx < kernel_x; ++kx)
+                        {
+                            const int y = iy + ky;
+                            const int x = ix + kx;
+                            if ((0 <= y && y < in_height) && (0 <= x && x < in_width))
+                            {
+                                ++filter_count;
+                                acc = acc + (AccEigenType)input_padded(ob, y + pad_top, x + pad_left, oc);
+                            }
+                        }
+                    }
+                    if (Dtype != TOSA_REF_TYPE_FP32 && Dtype != TOSA_REF_TYPE_FP16 && Dtype != TOSA_REF_TYPE_BF16 &&
+                        Dtype != TOSA_REF_TYPE_FP64)
+                    {
+                        try
+                        {
+                            int32_t multiplier, shift;
+                            OutEigenType out;
+                            TosaReference::QuantUtil::reciprocal_scale(filter_count, multiplier, shift);
+
+                            out = (OutEigenType)TosaReference::QuantUtil::apply_scale_32(acc, multiplier, shift, false);
+                            out = out + (OutEigenType)(attribute->output_zp());
+                            out = std::max(out, (OutEigenType)QMin);
+                            out_tens(ob, oh, ow, oc) = std::min(out, (OutEigenType)QMax);
+                        }
+                        catch (std::string desc)
+                        {
+                            REQUIRE(false, "OpAvgPool2d apply_scale_32() fails: %s.", desc.c_str());
+                        }
+                    }
+                    else
+                    {
+                        REQUIRE(filter_count != 0, "OpAvgPool2d number of filters should be non-zero.");
+                        out_tens(ob, oh, ow, oc) = acc / static_cast<OutEigenType>(filter_count);
+                    }
+                }
+            }
         }
-        this->out->getTensor() = this->out->getTensor() + (OutEigenType)(attribute->output_zp());
-        this->out->getTensor() = this->out->getTensor().cwiseMax((OutEigenType)QMin);
-        this->out->getTensor() = this->out->getTensor().cwiseMin((OutEigenType)QMax);
-    }
-    else
-    {
-        // Case for float-types
-        this->out->getTensor() = (sum / div_map.template cast<AccEigenType>()).template cast<OutEigenType>();
     }
-
+    this->out->getTensor() = out_tens;
 
     return GraphNode::eval();
 }
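For readers outside the reference model tree, a minimal standalone sketch of the same direct-pooling idea (single batch and channel for brevity; every name below is illustrative, not reference-model API): each output element sums only the in-bounds inputs of its window and divides by how many there were, which is exactly the role filter_count plays in the patch.

#include <algorithm>
#include <cstdio>
#include <vector>

// Direct average pooling over a row-major in_h x in_w input.
std::vector<float> avg_pool_2d(const std::vector<float>& in, int in_h, int in_w,
                               int kernel, int stride, int pad)
{
    const int out_h = (in_h + 2 * pad - kernel) / stride + 1;
    const int out_w = (in_w + 2 * pad - kernel) / stride + 1;
    std::vector<float> out(out_h * out_w, 0.0f);

    for (int oh = 0; oh < out_h; ++oh)
    {
        for (int ow = 0; ow < out_w; ++ow)
        {
            float acc = 0.0f;
            int count = 0;    // in-bounds elements only, like filter_count
            for (int ky = 0; ky < kernel; ++ky)
            {
                for (int kx = 0; kx < kernel; ++kx)
                {
                    const int y = oh * stride - pad + ky;
                    const int x = ow * stride - pad + kx;
                    if (0 <= y && y < in_h && 0 <= x && x < in_w)
                    {
                        ++count;
                        acc += in[y * in_w + x];
                    }
                }
            }
            out[oh * out_w + ow] = acc / std::max(count, 1);
        }
    }
    return out;
}

int main()
{
    // 4x4 ramp input, 3x3 kernel, stride 1, pad 1.
    std::vector<float> in(16);
    for (int i = 0; i < 16; ++i)
        in[i] = (float)i;
    const std::vector<float> out = avg_pool_2d(in, 4, 4, 3, 1, 1);
    for (int oh = 0; oh < 4; ++oh)
    {
        for (int ow = 0; ow < 4; ++ow)
            std::printf("%6.2f ", out[oh * 4 + ow]);
        std::printf("\n");
    }
    return 0;
}

With pad = 1 the corner windows only cover four in-bounds elements, so dividing by the tracked count rather than by kernel * kernel keeps the edge averages correct.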