about summary refs log tree commit diff
diff options
context:
space:
mode:
-rw-r--r-- src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp 14
-rw-r--r-- src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp 8
2 files changed, 15 insertions, 7 deletions
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index b295a0c685..3d7a16dd45 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -343,8 +343,15 @@ void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, Kern
ARM_COMPUTE_UNUSED(info);
ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- InputTransform input_transform(reinterpret_cast<const T *>(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, reinterpret_cast<T *>(_output->buffer()),
- _matrix_stride, _num_channels);
+ const int element_size_in_bytes = _input_nhwc->info()->element_size();
+ const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
+ const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
+ const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
+
+ InputTransform input_transform(reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes()),
+ _num_batches, _num_rows, _num_cols, _num_channels, _padding,
+ reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes()),
+ _matrix_stride, _num_channels, input_batch_stride, input_row_stride, input_col_stride);
// The code below cannot be moved to configure because biases hasn't been allocated at that point
const size_t fst = window.x().start();
@@ -446,7 +453,8 @@ void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, Ker
ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
OutputTransform output_transform(reinterpret_cast<T *>(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride,
- (_biases ? reinterpret_cast<T *>(_biases->buffer()) : nullptr), reinterpret_cast<T *>(_output_nhwc->buffer()),
+ (_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr),
+ reinterpret_cast<T *>(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes()),
_num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T));
// The code below cannot be moved to configure because biases hasn't been allocated at that point
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index e60fe80e0f..e4a7214c10 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -174,7 +174,7 @@ void Fallback<TypeInput, TypeOutput>::prepare()
if(_gemm_kernel_asm->B_pretranspose_required())
{
const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
@@ -223,9 +223,9 @@ void Fallback<TypeInput, TypeOutput>::run()
const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
const int multi_stride_d = _d->info()->strides_in_bytes()[3] / sizeof(TypeOutput);
- const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer());
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
- auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());
+ const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+ auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
// Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
if(_workspace.buffer() != nullptr)