diff options
author | Georgios Pinitas <georgios.pinitas@arm.com> | 2018-07-27 18:28:10 +0100 |
---|---|---|
committer | Anthony Barbier <anthony.barbier@arm.com> | 2018-11-02 16:54:54 +0000 |
commit | eb84d6b82ba744295f5249d2171ee6537e09751a (patch) | |
tree | dd736285c3d1c6174fbf5650e55b3f6199e91473 | |
parent | c50da386dc26a6e0a1690a47e72d5fa766e7dba2 (diff) | |
download | ComputeLibrary-eb84d6b82ba744295f5249d2171ee6537e09751a.tar.gz |
COMPMID-1434: Fix NEWinograd for NHWC and sub-tensors
Apply offsets and strides to winograd transform functions in NEON.
Change-Id: Ia4f44d22244203a5f9d93d2fed73570396b0d28c
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/141803
Tested-by: Jenkins <bsgcomp@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
-rw-r--r-- | src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp | 14 | ||||
-rw-r--r-- | src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp | 8 |
2 files changed, 15 insertions, 7 deletions
```diff
diff --git a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
index b295a0c685..3d7a16dd45 100644
--- a/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
+++ b/src/core/NEON/kernels/NEWinogradConvolutionLayerKernel.cpp
@@ -343,8 +343,15 @@ void NEWinogradLayerTransformInputKernel<T, OutputTileRows, OutputTileCols, Kern
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
 
-    InputTransform input_transform(reinterpret_cast<const T *>(_input_nhwc->buffer()), _num_batches, _num_rows, _num_cols, _num_channels, _padding, reinterpret_cast<T *>(_output->buffer()),
-                                   _matrix_stride, _num_channels);
+    const int element_size_in_bytes = _input_nhwc->info()->element_size();
+    const int input_col_stride = _input_nhwc->info()->strides_in_bytes().y() / element_size_in_bytes;
+    const int input_row_stride = _input_nhwc->info()->strides_in_bytes().z() / element_size_in_bytes;
+    const int input_batch_stride = _input_nhwc->info()->strides_in_bytes()[3] / element_size_in_bytes;
+
+    InputTransform input_transform(reinterpret_cast<const T *>(_input_nhwc->buffer() + _input_nhwc->info()->offset_first_element_in_bytes()),
+                                   _num_batches, _num_rows, _num_cols, _num_channels, _padding,
+                                   reinterpret_cast<T *>(_output->buffer() + _output->info()->offset_first_element_in_bytes()),
+                                   _matrix_stride, _num_channels, input_batch_stride, input_row_stride, input_col_stride);
 
     // The code below cannot be moved to configure because biases hasn't been allocated at that point
     const size_t fst = window.x().start();
@@ -446,7 +453,8 @@ void NEWinogradLayerTransformOutputKernel<T, OutputTileRows, OutputTileCols, Ker
     ARM_COMPUTE_ERROR_ON_NULLPTR(_output_nhwc);
 
     OutputTransform output_transform(reinterpret_cast<T *>(_output_workspace->buffer()), _matrix_stride, _matrix_row_stride,
-                                     (_biases ? reinterpret_cast<T *>(_biases->buffer()) : nullptr), reinterpret_cast<T *>(_output_nhwc->buffer()),
+                                     (_biases ? reinterpret_cast<T *>(_biases->buffer() + _biases->info()->offset_first_element_in_bytes()) : nullptr),
+                                     reinterpret_cast<T *>(_output_nhwc->buffer() + _output_nhwc->info()->offset_first_element_in_bytes()),
                                      _num_batches, _num_rows, _num_cols, _num_channels, 0, _output_nhwc->info()->strides_in_bytes()[2] / sizeof(T), _output_nhwc->info()->strides_in_bytes()[1] / sizeof(T));
 
     // The code below cannot be moved to configure because biases hasn't been allocated at that point
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index e60fe80e0f..e4a7214c10 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -174,7 +174,7 @@ void Fallback<TypeInput, TypeOutput>::prepare()
     if(_gemm_kernel_asm->B_pretranspose_required())
     {
         const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
-        const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
+        const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
         const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
 
         ARM_COMPUTE_ERROR_ON(_pretranspose.buffer() == nullptr);
@@ -223,9 +223,9 @@ void Fallback<TypeInput, TypeOutput>::run()
     const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
     const int multi_stride_d = _d->info()->strides_in_bytes()[3] / sizeof(TypeOutput);
 
-    const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer());
-    const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer());
-    auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer());
+    const auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+    const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+    auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
 
     // Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
     if(_workspace.buffer() != nullptr)
```