aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
author Manuel Bottini <manuel.bottini@arm.com> 2021-07-06 15:01:35 +0100
committer Manuel Bottini <manuel.bottini@arm.com> 2021-07-15 16:27:56 +0000
commit 29599d01a8f498e33b9c6995bd879473dc02e077 (patch)
tree 2dcbd399c5cf0a87edbd1885e29e8cb1ed1ce9bc
parent 13ef1763d6eef9606afaed90fb258d1a4577f15b (diff)
download ComputeLibrary-29599d01a8f498e33b9c6995bd879473dc02e077.tar.gz
Port NEGEMMConvolutionLayer
Details:
port NEWeightsReshapeKernel to CpuWeightsReshapeKernel
port NEGEMMConvolutionLayer to CpuGEMMConvolutionLayer
Resolves: COMPMID-4509
Change-Id: I3c7051e2c3f6d808a7ccb898aad70e5b221b9dc3
Signed-off-by: Manuel Bottini <manuel.bottini@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5938
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
-rw-r--r-- Android.bp 3
-rw-r--r-- arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h 201
-rw-r--r-- docs/user_guide/release_version_and_change_log.dox 6
-rw-r--r-- filelist.json 5
-rw-r--r-- src/core/NEON/NEKernels.h 1
-rw-r--r-- src/core/NEON/kernels/NEWeightsReshapeKernel.cpp 177
-rw-r--r-- src/core/cpu/kernels/CpuIm2ColKernel.cpp 2
-rw-r--r-- src/core/cpu/kernels/CpuIm2ColKernel.h 2
-rw-r--r-- src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp 170
-rw-r--r-- src/core/cpu/kernels/CpuWeightsReshapeKernel.h (renamed from src/core/NEON/kernels/NEWeightsReshapeKernel.h) 66
-rw-r--r-- src/runtime/NEON/functions/NEDeconvolutionLayer.cpp 1
-rw-r--r-- src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp 641
-rw-r--r-- src/runtime/cpu/operators/CpuGemmConvolution.cpp 602
-rw-r--r-- src/runtime/cpu/operators/CpuGemmConvolution.h 197
-rw-r--r-- tests/validation/NEON/ConvolutionLayer.cpp 96
15 files changed, 1168 insertions, 1002 deletions
diff --git a/Android.bp b/Android.bp
index b32a9da895..f2934cb37d 100644
--- a/Android.bp
+++ b/Android.bp
@@ -176,7 +176,6 @@ cc_library_static {
"src/core/NEON/kernels/NEStackLayerKernel.cpp",
"src/core/NEON/kernels/NEStridedSliceKernel.cpp",
"src/core/NEON/kernels/NETileKernel.cpp",
- "src/core/NEON/kernels/NEWeightsReshapeKernel.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp16.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_fp32.cpp",
"src/core/NEON/kernels/arm_conv/depthwise/depthwise_s8q.cpp",
@@ -283,6 +282,7 @@ cc_library_static {
"src/core/cpu/kernels/CpuSoftmaxKernel.cpp",
"src/core/cpu/kernels/CpuSubKernel.cpp",
"src/core/cpu/kernels/CpuTransposeKernel.cpp",
+ "src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp",
"src/core/cpu/kernels/CpuWinogradConv2dKernel.cpp",
"src/core/cpu/kernels/activation/neon/fp16.cpp",
"src/core/cpu/kernels/activation/neon/fp32.cpp",
@@ -639,6 +639,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuFlatten.cpp",
"src/runtime/cpu/operators/CpuFloor.cpp",
"src/runtime/cpu/operators/CpuGemm.cpp",
+ "src/runtime/cpu/operators/CpuGemmConvolution.cpp",
"src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp",
"src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp",
"src/runtime/cpu/operators/CpuGemmLowpOutputStage.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index e3b7d91187..2ebb80bef2 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -27,143 +27,21 @@
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/IFunction.h"
+#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMM.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h"
-#include "arm_compute/runtime/NEON/functions/NEGEMMLowpOutputStage.h"
-#include "arm_compute/runtime/NEON/functions/NEReshapeLayer.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
class ITensor;
-class NEWeightsReshapeKernel;
-namespace cpu
-{
-namespace kernels
-{
-class CpuIm2ColKernel;
-class CpuCol2ImKernel;
-} // namespace kernels
-} // namespace cpu
-
-/** Function to reshape the weights. This function calls the following kernel:
- * -# @ref NEWeightsReshapeKernel
- */
-class NEConvolutionLayerReshapeWeights : public IFunction
-{
-public:
- /** Constructor */
- NEConvolutionLayerReshapeWeights() noexcept;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeights(const NEConvolutionLayerReshapeWeights &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeights(NEConvolutionLayerReshapeWeights &&) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeights &operator=(const NEConvolutionLayerReshapeWeights &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeights &operator=(NEConvolutionLayerReshapeWeights &&) = delete;
- /** Default destructor */
- ~NEConvolutionLayerReshapeWeights();
- /** Set the input and output tensors.
- *
- * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: All.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: same as @p weights.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output Destination tensor. Data types supported: same as @p weights.
- */
- void configure(const ITensor *weights, const ITensor *biases, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEConvolutionLayerReshapeWeights
- *
- * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
- * Data type supported: All.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: same as @p weights.
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output Destination tensor. Data types supported: same as @p weights.
- *
- * @return an error status
- */
- static Status validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output);
-
- // Inherited methods overridden:
- void run() override;
-
-private:
- std::unique_ptr<NEWeightsReshapeKernel> _weights_reshape_kernel;
-};
-
-namespace weights_transformations
-{
-/** Basic function to manage the reshape weights generated from @ref NEConvolutionLayerReshapeWeights */
-class NEConvolutionLayerReshapeWeightsTransform : public ITransformWeights
-{
-public:
- /** Constructor */
- NEConvolutionLayerReshapeWeightsTransform() = default;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeightsTransform(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEConvolutionLayerReshapeWeightsTransform &operator=(const NEConvolutionLayerReshapeWeightsTransform &) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeightsTransform(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
- /** Prevent instances of this class from being moved (As this class contains non movable objects) */
- NEConvolutionLayerReshapeWeightsTransform &operator=(NEConvolutionLayerReshapeWeightsTransform &&) = delete;
- /** Default destructor */
- ~NEConvolutionLayerReshapeWeightsTransform() = default;
- void configure(const ITensor *input, const ITensor *biases)
- {
- _bias_bit = (biases != nullptr) ? 1 : 0;
- _func.configure(input, biases, &_output);
- }
-
- void run() override
- {
- _output.allocator()->allocate();
- _func.run();
- _reshape_run = true;
- }
-
- ITensor *get_weights() override
- {
- return &_output;
- }
-
- void release() override
- {
- _output.allocator()->free();
- }
-
- uint32_t uid() override
- {
- return ((0x8) | (_bias_bit << 7));
- }
-
- bool is_reshape_run()
- {
- return _reshape_run;
- }
-
-private:
- Tensor _output{};
- NEConvolutionLayerReshapeWeights _func{};
- int32_t _bias_bit{ 0 };
-};
-} // namespace weights_transformations
+class ITensorInfo;
/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
*
- * -# @ref cpu::kernels::CpuIm2ColKernel
- * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
- * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEGEMMLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref NEArithmeticAddition (if biases != nullptr and we have a 1x1 convolution with the NHWC data layout)
- * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout)
+ * -# @ref cpu::CpuGemmConvolution
*
*/
class NEGEMMConvolutionLayer : public IFunction
@@ -244,73 +122,8 @@ public:
void prepare() override;
private:
- /** Configures the appropriate matrix multiply routine
- *
- * @param[in] input Input tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[out] output Output tensor. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- */
- void configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(), int gemm_3d_depth = 1);
- /** Static function to check if given info will lead to a valid configuration of @ref NEGEMMConvolutionLayer matrix multiply routines
- *
- * @param[in] input Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
- * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
- * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
- * @param[in] output Output tensor info. Data types supported: Same as @p input,
- * except for input of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
- * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
- * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
- *
- * @return a status
- */
- static Status validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
- int gemm_3d_depth = 1, bool skip_im2col = false);
- /** Static function to check if GEMM3D is supported in @ref NEGEMM or in @ref NEGEMMLowpMatrixMultiplyCore
- *
- * @param[in] input_info Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] weights_info Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
- * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
- * @param[in] gemm_3d_depth Depth of GEMM 3D
- * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
- *
- * @return a status
- */
- static Status validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
-
-private:
- MemoryGroup _memory_group;
- IWeightsManager *_weights_manager;
- NEConvolutionLayerReshapeWeights _reshape_weights;
- weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
- std::unique_ptr<cpu::kernels::CpuIm2ColKernel> _im2col_kernel;
- NEGEMM _mm_gemm;
- NEGEMMLowpMatrixMultiplyCore _mm_gemmlowp;
- std::unique_ptr<cpu::kernels::CpuCol2ImKernel> _col2im_kernel;
- NEReshapeLayer _reshape_layer;
-
- const ITensor *_input;
- const ITensor *_original_weights;
- ITensor *_original_output;
-
- Tensor _im2col_output;
- Tensor _weights_reshaped;
- Tensor _gemm_output;
- Tensor _gemm_output_3d;
- Tensor _tmp_output;
-
- DataLayout _data_layout;
-
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _is_prepared;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
-#endif /* ARM_COMPUTE_NECONVOLUTIONGEMMLAYER_H */
+#endif /* ARM_COMPUTE_NEGEMMCONVOLUTIONLAYER_H */
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index d8258a10cd..8b15a384cc 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -583,8 +583,8 @@ v20.05 Public major release
- Updated recommended gcc version to Linaro 6.3.1.
- Added Bfloat16 type support
- Added Bfloat16 support in:
- - @ref NEWeightsReshapeKernel
- - @ref NEConvolutionLayerReshapeWeights
+ - NEWeightsReshapeKernel
+ - NEConvolutionLayerReshapeWeights
- NEIm2ColKernel
- NEIm2Col
- NEDepthConvertLayerKernel
@@ -1321,7 +1321,7 @@ v17.06 Public major release
- NEDepthConcatenateLayerKernel / NEDepthConcatenateLayer
- NEDirectConvolutionLayerKernel / @ref NEDirectConvolutionLayer
- NELocallyConnectedMatrixMultiplyKernel / NELocallyConnectedLayer
- - @ref NEWeightsReshapeKernel / @ref NEConvolutionLayerReshapeWeights
+ - NEWeightsReshapeKernel / NEConvolutionLayerReshapeWeights
v17.05 Public bug fixes release
- Various bug fixes
diff --git a/filelist.json b/filelist.json
index d520f8e1dd..c311af459d 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1187,8 +1187,11 @@
"ConvertQuantizedSignedness"
],
"files": {
+ "operator": [
+ "src/runtime/cpu/operators/CpuGemmConvolution.cpp"
+ ],
"kernel": [
- "src/core/NEON/kernels/NEWeightsReshapeKernel.cpp"
+ "src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp"
]
}
},
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 6c6c51dd87..6d45a9d80c 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -65,6 +65,5 @@
#include "src/core/NEON/kernels/NEStackLayerKernel.h"
#include "src/core/NEON/kernels/NEStridedSliceKernel.h"
#include "src/core/NEON/kernels/NETileKernel.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
#endif /* ARM_COMPUTE_NEKERNELS_H */
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp b/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
deleted file mode 100644
index 9bef9c30d9..0000000000
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.cpp
+++ /dev/null
@@ -1,177 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
-
-#include "arm_compute/core/Helpers.h"
-#include "arm_compute/core/Validate.h"
-#include "src/core/helpers/AutoConfiguration.h"
-#include "src/core/helpers/WindowHelpers.h"
-
-namespace arm_compute
-{
-namespace
-{
-TensorShape get_output_shape(const ITensorInfo *input, bool has_bias)
-{
- TensorShape output_shape{ input->tensor_shape() };
-
- output_shape.collapse(3);
- const size_t tmp_dim = output_shape[0];
- output_shape.set(0, output_shape[1]);
- output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
-
- return output_shape;
-}
-
-Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(input) is not needed here as this kernel doesn't use CPU FP16 instructions.
- ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN);
-
- if(biases != nullptr)
- {
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(input->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3]));
- ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4]));
- }
-
- // Checks performed when output is configured
- if(output->total_size() != 0)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), get_output_shape(input, biases != nullptr));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output);
- }
-
- return Status{};
-}
-
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input)
-{
- Window window = calculate_max_window(*input, Steps());
- window.set(Window::DimX, Window::Dimension(0, input->dimension(0), input->dimension(0)));
- window.set(Window::DimY, Window::Dimension(0, input->dimension(1), input->dimension(1)));
- window.set(Window::DimZ, Window::Dimension(0, input->dimension(2), input->dimension(2)));
-
- // The NEConvolutionLayerWeightsReshapeKernel doesn't need padding so update_window_and_padding() can be skipped
-
- return std::make_pair(Status{}, window);
-}
-} // namespace
-
-NEWeightsReshapeKernel::NEWeightsReshapeKernel()
- : _input(nullptr), _bias(nullptr), _output(nullptr)
-{
-}
-
-void NEWeightsReshapeKernel::configure(const ITensor *input, const ITensor *bias, ITensor *output)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
- // Output tensor auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), input->info()->clone()->set_tensor_shape(get_output_shape(input->info(), (bias != nullptr))));
-
- // Perform validation step
- ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(),
- (bias != nullptr) ? bias->info() : nullptr,
- output->info()));
-
- _input = input;
- _bias = bias;
- _output = output;
-
- // Configure kernel
- auto win_config = validate_and_configure_window(input->info());
- ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
- INEKernel::configure(win_config.second);
-}
-
-Status NEWeightsReshapeKernel::validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, biases, output));
- ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get()).first);
-
- return Status{};
-}
-
-void NEWeightsReshapeKernel::run(const Window &window, const ThreadInfo &info)
-{
- ARM_COMPUTE_UNUSED(info);
- ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
- ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
-
- const unsigned int kernel_size_x = _input->info()->dimension(0);
- const unsigned int kernel_size_y = _input->info()->dimension(1);
- const unsigned int kernel_depth = _input->info()->dimension(2);
- const unsigned int input_stride_x = _input->info()->strides_in_bytes().x();
- const unsigned int input_stride_y = _input->info()->strides_in_bytes().y();
- const unsigned int input_stride_z = _input->info()->strides_in_bytes().z();
- const unsigned int output_stride_y = _output->info()->strides_in_bytes().y();
-
- // Create iterators
- Iterator in(_input, window);
- execute_window_loop(window, [&](const Coordinates & id)
- {
- // Get column index
- const int kernel_idx = id[3];
- const int kernel_idz = id[4];
-
- // Setup pointers
- const uint8_t *tmp_input_ptr = in.ptr();
- uint8_t *tmp_output_ptr = _output->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
- const uint8_t *curr_input_row_ptr = tmp_input_ptr;
- const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
-
- // Linearize volume
- for(unsigned int d = 0; d < kernel_depth; ++d)
- {
- for(unsigned int j = 0; j < kernel_size_y; ++j)
- {
- for(unsigned int i = 0; i < kernel_size_x; ++i)
- {
- std::memcpy(tmp_output_ptr, tmp_input_ptr, _input->info()->element_size());
- tmp_input_ptr += input_stride_x;
- tmp_output_ptr += output_stride_y;
- }
- curr_input_row_ptr += input_stride_y;
- tmp_input_ptr = curr_input_row_ptr;
- }
- curr_input_depth_ptr += input_stride_z;
- curr_input_row_ptr = curr_input_depth_ptr;
- tmp_input_ptr = curr_input_depth_ptr;
- }
-
- // Add bias
- if(_bias != nullptr)
- {
- std::memcpy(tmp_output_ptr, _bias->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), _input->info()->element_size());
- }
- },
- in);
-}
-} // namespace arm_compute
diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.cpp b/src/core/cpu/kernels/CpuIm2ColKernel.cpp
index a5dbcc29c8..ca6c9bfab4 100644
--- a/src/core/cpu/kernels/CpuIm2ColKernel.cpp
+++ b/src/core/cpu/kernels/CpuIm2ColKernel.cpp
@@ -331,7 +331,7 @@ void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window
in, out);
}
-void CpuIm2ColKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+void CpuIm2ColKernel::configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
bool has_bias, const Size2D &dilation, unsigned int num_groups)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
diff --git a/src/core/cpu/kernels/CpuIm2ColKernel.h b/src/core/cpu/kernels/CpuIm2ColKernel.h
index 4301a237fe..ffac5077b2 100644
--- a/src/core/cpu/kernels/CpuIm2ColKernel.h
+++ b/src/core/cpu/kernels/CpuIm2ColKernel.h
@@ -77,7 +77,7 @@ public:
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
*/
- void configure(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+ void configure(const ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
/** Static function to check if given info will lead to a valid configuration
*
diff --git a/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp b/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp
new file mode 100644
index 0000000000..79f058944d
--- /dev/null
+++ b/src/core/cpu/kernels/CpuWeightsReshapeKernel.cpp
@@ -0,0 +1,170 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h"
+
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace kernels
+{
+namespace
+{
+TensorShape get_output_shape(const ITensorInfo *src, bool has_bias)
+{
+ TensorShape output_shape{ src->tensor_shape() };
+
+ output_shape.collapse(3);
+ const size_t tmp_dim = output_shape[0];
+ output_shape.set(0, output_shape[1]);
+ output_shape.set(1, tmp_dim + (has_bias ? 1 : 0));
+
+ return output_shape;
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ //Note: ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src) is not needed here as this kernel doesn't use CPU FP16 instructions.
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+
+ if(biases != nullptr)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(src->data_type()));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->num_dimensions() != 1));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->num_dimensions() != 2));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 4) && (biases->dimension(0) != src->tensor_shape()[3]));
+ ARM_COMPUTE_RETURN_ERROR_ON((src->num_dimensions() == 5) && (biases->dimension(0) != src->tensor_shape()[3] || biases->dimension(1) != src->tensor_shape()[4]));
+ }
+
+ // Checks performed when output is configured
+ if(dst->total_size() != 0)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), get_output_shape(src, biases != nullptr));
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ }
+
+ return Status{};
+}
+} // namespace
+
+void CpuWeightsReshapeKernel::configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Output tensor auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(get_output_shape(src, (biases != nullptr))));
+
+ // Perform validation step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src,
+ biases,
+ dst));
+
+ // Configure kernel
+ Window window = calculate_max_window(*src, Steps());
+ window.set(Window::DimX, Window::Dimension(0, src->dimension(0), src->dimension(0)));
+ window.set(Window::DimY, Window::Dimension(0, src->dimension(1), src->dimension(1)));
+ window.set(Window::DimZ, Window::Dimension(0, src->dimension(2), src->dimension(2)));
+ ICpuKernel::configure(window);
+}
+
+Status CpuWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst)
+{
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst));
+ return Status{};
+}
+
+void CpuWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
+{
+ ARM_COMPUTE_UNUSED(info);
+ ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+ ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
+
+ auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+ auto biases = tensors.get_const_tensor(TensorType::ACL_BIAS);
+ auto dst = tensors.get_tensor(TensorType::ACL_DST);
+
+ const unsigned int kernel_size_x = src->info()->dimension(0);
+ const unsigned int kernel_size_y = src->info()->dimension(1);
+ const unsigned int kernel_depth = src->info()->dimension(2);
+ const unsigned int input_stride_x = src->info()->strides_in_bytes().x();
+ const unsigned int input_stride_y = src->info()->strides_in_bytes().y();
+ const unsigned int input_stride_z = src->info()->strides_in_bytes().z();
+ const unsigned int output_stride_y = dst->info()->strides_in_bytes().y();
+
+ // Create iterators
+ Iterator in(src, window);
+ execute_window_loop(window, [&](const Coordinates & id)
+ {
+ // Get column index
+ const int kernel_idx = id[3];
+ const int kernel_idz = id[4];
+
+ // Setup pointers
+ const uint8_t *tmp_input_ptr = in.ptr();
+ uint8_t *tmp_output_ptr = dst->ptr_to_element(Coordinates(kernel_idx, 0, kernel_idz));
+ const uint8_t *curr_input_row_ptr = tmp_input_ptr;
+ const uint8_t *curr_input_depth_ptr = tmp_input_ptr;
+
+ // Linearize volume
+ for(unsigned int d = 0; d < kernel_depth; ++d)
+ {
+ for(unsigned int j = 0; j < kernel_size_y; ++j)
+ {
+ for(unsigned int i = 0; i < kernel_size_x; ++i)
+ {
+ std::memcpy(tmp_output_ptr, tmp_input_ptr, src->info()->element_size());
+ tmp_input_ptr += input_stride_x;
+ tmp_output_ptr += output_stride_y;
+ }
+ curr_input_row_ptr += input_stride_y;
+ tmp_input_ptr = curr_input_row_ptr;
+ }
+ curr_input_depth_ptr += input_stride_z;
+ curr_input_row_ptr = curr_input_depth_ptr;
+ tmp_input_ptr = curr_input_depth_ptr;
+ }
+
+ // Add bias
+ if(biases != nullptr)
+ {
+ std::memcpy(tmp_output_ptr, biases->ptr_to_element(Coordinates(kernel_idx, kernel_idz)), src->info()->element_size());
+ }
+ },
+ in);
+}
+const char *CpuWeightsReshapeKernel::name() const
+{
+ return "CpuWeightsReshapeKernel";
+}
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/cpu/kernels/CpuWeightsReshapeKernel.h
index 5701c84cac..eea150a96e 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/src/core/cpu/kernels/CpuWeightsReshapeKernel.h
@@ -21,15 +21,18 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
-#define ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H
+#ifndef ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H
+#define ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H
-#include "src/core/NEON/INEKernel.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
namespace arm_compute
{
-class ITensor;
-
+namespace cpu
+{
+namespace kernels
+{
/** Kernel to perform reshaping on the weights used by convolution and locally connected layer
*
* Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
@@ -53,57 +56,36 @@ class ITensor;
* \end{array} \right)
* @f]
*/
-class NEWeightsReshapeKernel : public INEKernel
+class CpuWeightsReshapeKernel : public ICpuKernel
{
public:
- const char *name() const override
- {
- return "NEWeightsReshapeKernel";
- }
- /** Constructor.*/
- NEWeightsReshapeKernel();
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWeightsReshapeKernel(const NEWeightsReshapeKernel &) = delete;
- /** Prevent instances of this class from being copied (As this class contains pointers) */
- NEWeightsReshapeKernel &operator=(const NEWeightsReshapeKernel &) = delete;
- /** Allow instances of this class to be moved */
- NEWeightsReshapeKernel(NEWeightsReshapeKernel &&) = default;
- /** Allow instances of this class to be moved */
- NEWeightsReshapeKernel &operator=(NEWeightsReshapeKernel &&) = default;
- /** Default destructor */
- ~NEWeightsReshapeKernel() = default;
+ /** Default constructor */
+ CpuWeightsReshapeKernel() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuWeightsReshapeKernel); // Going by the macro name: copying is disabled, moving stays allowed (confirm in src/core/common/Macros.h)
/** Set the input and output of the kernel.
*
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+ * @param[in] src The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
* and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
* Data types supported: All
- * @param[in] bias The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+ * @param[in] biases The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
* dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p src
* @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[out] output The output tensor. Data types supported: Same as @p input
+ * @param[out] dst The output tensor info. Data types supported: Same as @p src
*/
- void configure(const ITensor *input, const ITensor *bias, ITensor *output);
- /** Static function to check if given info will lead to a valid configuration of @ref NEWeightsReshapeKernel
+ void configure(const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
*
- * @param[in] input The input tensor to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
- * and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared.
- * Data types supported: All
- * @param[in] biases The shared biases tensor to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
- * dimensions [OFM, num_patches] if unshared. Data types supported: Same as @p input
- * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
- * @param[in] output The output tensor. Should be a 2D Tensor. Data types supported: Same as @p input
+ * Similar to CpuWeightsReshapeKernel::configure()
*
* @return a status
*/
- static Status validate(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output);
+ static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst);
// Inherited methods overridden:
- void run(const Window &window, const ThreadInfo &info) override;
-
-private:
- const ITensor *_input;
- const ITensor *_bias;
- ITensor *_output;
+ void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override; // Tensors arrive through the pack at run time; the kernel no longer stores tensor pointers as members
+ const char *name() const override;
};
+} // namespace kernels
+} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_NEWEIGHTSRESHAPEKERNEL_H */
+#endif /* ARM_COMPUTE_CPU_WEIGHTSRESHAPE_KERNEL_H */
diff --git a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
index 5bd61b4074..712f41f369 100644
--- a/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEDeconvolutionLayer.cpp
@@ -28,7 +28,6 @@
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
#include "src/core/helpers/AutoConfiguration.h"
using namespace arm_compute::misc::shape_calculator;
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index 7c06b0adf5..6386a678db 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -26,618 +26,99 @@
#include "arm_compute/core/Size2D.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/Validate.h"
-#include "arm_compute/core/utils/misc/ShapeCalculator.h"
-#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
-#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/runtime/cpu/operators/CpuGemmConvolution.h"
-#include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
-#include "src/core/cpu/kernels/CpuCol2ImKernel.h"
-#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
-
-#include <set>
-#include <tuple>
+using namespace arm_compute::experimental;
namespace arm_compute
{
-using namespace arm_compute::misc::shape_calculator;
-
-NEConvolutionLayerReshapeWeights::~NEConvolutionLayerReshapeWeights() = default;
-NEConvolutionLayerReshapeWeights::NEConvolutionLayerReshapeWeights() noexcept
- : _weights_reshape_kernel()
-{
-}
-
-void NEConvolutionLayerReshapeWeights::configure(const ITensor *weights, const ITensor *biases, ITensor *output)
-{
- // Perform validation step
- ARM_COMPUTE_ERROR_ON_NULLPTR(weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEConvolutionLayerReshapeWeights::validate(weights->info(),
- (biases != nullptr) ? biases->info() : nullptr,
- output->info()));
- const bool append_biases = (biases != nullptr) && !is_data_type_quantized_asymmetric(weights->info()->data_type());
- const ITensor *biases_to_use = (append_biases) ? biases : nullptr;
-
- _weights_reshape_kernel = std::make_unique<NEWeightsReshapeKernel>();
- _weights_reshape_kernel->configure(weights, biases_to_use, output);
-
- output->info()->set_quantization_info(weights->info()->quantization_info());
-}
-
-Status NEConvolutionLayerReshapeWeights::validate(const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output)
-{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(weights);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1,
- DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL,
- DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- if(biases != nullptr)
- {
- const int idx_kernels = get_data_layout_dimension_index(weights->data_layout(), DataLayoutDimension::BATCHES);
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_asymmetric(weights->data_type()));
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases);
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- if((output != nullptr) && (output->total_size() != 0))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, output);
-
- NEWeightsReshapeKernel::validate(weights, biases, output);
- }
-
- return Status{};
-}
-
-void NEConvolutionLayerReshapeWeights::run()
+struct NEGEMMConvolutionLayer::Impl // Private state of NEGEMMConvolutionLayer, reached through _impl
{
- NEScheduler::get().schedule(_weights_reshape_kernel.get(), 3);
-}
-
-NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
+ const ITensor *weights{ nullptr }; /**< Weights tensor recorded by configure(); prepare() may mark it as unused */
+ std::unique_ptr<cpu::CpuGemmConvolution> op{ nullptr }; /**< Operator created in configure(); run() and prepare() forward to it */
+ ITensorPack run_pack{}; /**< Pack passed to op->run(); configure() fills it with source, weights, biases and destination */
+ ITensorPack prep_pack{}; /**< Pack passed to op->prepare(); configure() fills it with weights and biases */
+ MemoryGroup memory_group{}; /**< Replaced in the constructor by a group built from the memory manager; scoped around op->run() */
+ IWeightsManager *weights_manager{ nullptr }; /**< Pointer stored by the constructor; not read anywhere else in the code shown here */
+ MemoryRequirements aux_mem_req{}; /**< Auxiliary memory requirements returned by op->workspace() */
+ WorkspaceData<Tensor> workspace_tensors{}; /**< Result of manage_workspace<Tensor>() for aux_mem_req */
+ bool is_prepared{ false }; /**< Set to true at the end of the first prepare() so later calls do nothing */
+};
NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager) // Constructor: allocates the Impl and records both managers in it
- : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
- _col2im_kernel(), _reshape_layer(), _input(nullptr), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(),
- _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false)
-{
-}
-
-void NEGEMMConvolutionLayer::configure_mm(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const ActivationLayerInfo &act_info, int gemm_3d_depth)
-{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights);
- ARM_COMPUTE_ERROR_THROW_ON(validate_mm(input->info(), weights->info(), biases == nullptr ? nullptr : biases->info(), output == nullptr ? nullptr : output->info(),
- act_info, gemm_3d_depth, _skip_im2col));
-
- // Create GEMMInfo structure
- const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- // Supported activations in GEMM
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
-
- if(_is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo iqinfo = input->info()->quantization_info();
- const QuantizationInfo wqinfo = weights->info()->quantization_info();
- const QuantizationInfo oqinfo = (output->info()->total_size() == 0) ? iqinfo : output->info()->quantization_info();
- const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = input->info()->data_type();
-
- input->info()->set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
- if(!is_data_type_quantized_per_channel(weights->info()->data_type()))
- {
- const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
- weights->info()->set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
- }
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- if(supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->info()->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info);
-
- _mm_gemmlowp.configure(input, weights, biases, output, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
-
- // Revert back QuantizatioInfo as input and weights could be used in other convolution layers
- input->info()->set_quantization_info(iqinfo);
- weights->info()->set_quantization_info(wqinfo);
- }
- else
- {
- // Configure matrix multiply function
- _mm_gemm.configure(input, weights, biases, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_mm(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output,
- const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+ : _impl(std::make_unique<Impl>()) // All state of the function lives in this heap-allocated Impl
{
- const DataType data_type = input->data_type();
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_activation_enabled = act_info.enabled();
-
- // Create GEMMInfo structure
- const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
- gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
- false, GEMMLowpOutputStageInfo(), false, false, act_info);
-
- if(is_quantized)
- {
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo &iqinfo = input->quantization_info();
- const QuantizationInfo &wqinfo = weights->quantization_info();
- const QuantizationInfo &oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
-
- // Merge activation with output stage
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
-
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
- }
-
- GEMMLowpOutputStageInfo output_info;
- output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- output_info.gemmlowp_offset = uoqinfo.offset;
- output_info.gemmlowp_min_bound = min_activation;
- output_info.gemmlowp_max_bound = max_activation;
- output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
-
- // Perform validation step on GEMMLowp
- std::unique_ptr<ITensorInfo> input_qa = input->clone();
- std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
- input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
- weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset));
- return NEGEMMLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, output, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
- }
- else
- {
- // Perform validation step on Matrix multiply function
- return NEGEMM::validate(input, weights, nullptr, output, 1.0f, 0.0f, gemm_info);
- }
-}
-
-Status NEGEMMConvolutionLayer::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
-{
- const DataType data_type = input_info->data_type();
- const unsigned int mult_y = skip_im2col ? 1U : gemm_3d_depth;
- const unsigned int mult_z = skip_im2col ? gemm_3d_depth : 1U;
-
- // Set dummy tensor shapes for the validation
- const TensorInfo dummy_input_info(TensorShape(4U, 4U * mult_y, 1U * mult_z), 1, data_type, input_info->quantization_info());
- const TensorInfo dummy_weights_info(TensorShape(4U, 4U), 1, data_type, weights_info->quantization_info());
- const TensorInfo dummy_output_info(TensorShape(4U, 4U, gemm_3d_depth), 1, data_type, input_info->quantization_info());
-
- return validate_mm(&dummy_input_info, &dummy_weights_info, nullptr, &dummy_output_info, act_info, gemm_3d_depth, skip_im2col);
+ _impl->weights_manager = weights_manager; // Keep the weights manager pointer supplied by the caller
+ _impl->memory_group = MemoryGroup(memory_manager); // Replace the default-constructed group with one built from memory_manager
}
+NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default; // Defaulted in this file, after Impl is defined; presumably so the owning _impl pointer can destroy a complete type (confirm against the header)
void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const PadStrideInfo &conv_info, const WeightsInfo &weights_info,
const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{ // Creates and configures the CpuGemmConvolution operator, then records the tensors and workspace needed by run() and prepare()
ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_UNUSED(num_groups, weights_info);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConvolutionLayer::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- conv_info,
- weights_info,
- dilation,
- act_info,
- num_groups));
-
- const DataType data_type = input->info()->data_type();
- const DataLayout data_layout = input->info()->data_layout();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->info()->dimension(idx_width);
- const unsigned int kernel_height = weights->info()->dimension(idx_height);
-
- _input = input;
- _is_prepared = weights_info.retain_internal_weights();
- _original_weights = weights;
- _original_output = output;
- _is_quantized = is_data_type_quantized_asymmetric(input->info()->data_type());
- _data_layout = data_layout;
- _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- const ITensor *gemm_input_to_use = input;
- ITensor *gemm_output_to_use = output;
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
- std::tie(conv_w, conv_h) = scaled_dimensions(input->info()->dimension(idx_width),
- input->info()->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- if(data_layout == DataLayout::NHWC)
- {
- _skip_col2im = bool(validate_gemm3d(input->info(), weights->info(), act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!_skip_col2im)
- {
- _skip_im2col = false;
- }
- }
- else
- {
- _skip_col2im = false;
- }
-
- // Get parameters from conv_info
- unsigned int stride_x = 0;
- unsigned int stride_y = 0;
- std::tie(stride_x, stride_y) = conv_info.stride();
-
- unsigned int mat_weights_cols = weights->info()->dimension(idx_kernels);
-
- // _weights_reshaped will be auto configured in the kernel.
- // Just append biases and do not transpose 1xW as it will be reshaped in NEGEMM
- const ITensor *weights_to_use = weights;
-
- if(_weights_manager && _weights_manager->are_weights_managed(weights))
- {
- _reshape_weights_managed.configure(weights, nullptr);
- weights_to_use = _weights_manager->acquire(weights, &_reshape_weights_managed);
- }
- else
- {
- _reshape_weights.configure(weights, nullptr, &_weights_reshaped);
- weights_to_use = &_weights_reshaped;
- }
-
- // Create tensor to store im2col reshaped inputs
- if(!_skip_im2col)
- {
- _memory_group.manage(&_im2col_output);
-
- // Configure
- _im2col_kernel = std::make_unique<cpu::kernels::CpuIm2ColKernel>();
- _im2col_kernel->configure(input->info(), _im2col_output.info(), Size2D(kernel_width, kernel_height), conv_info, false, dilation);
-
- // Update GEMM input
- gemm_input_to_use = &_im2col_output;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!_skip_col2im)
- {
- TensorShape shape_gemm;
-
- // Calculate GEMM output shape
- shape_gemm = _im2col_output.info()->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
-
- // FIXME: input->clone() doesn't work with subtensors for grouped convolutions.
- TensorInfo info_gemm(shape_gemm, 1, output_data_type);
- info_gemm.set_quantization_info(output->info()->quantization_info()).set_data_layout(input->info()->data_layout());
- _gemm_output.allocator()->init(info_gemm);
- _gemm_output_3d.allocator()->init(info_gemm);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output;
- }
- else
- {
- TensorInfo out_info{ *output->info() };
- out_info.set_data_type(output_data_type).set_data_layout(input->info()->data_layout()).set_is_resizable(true);
- _gemm_output.allocator()->init(out_info);
- _gemm_output_3d.allocator()->init(out_info);
- _memory_group.manage(&_gemm_output);
-
- // Update GEMM output
- gemm_output_to_use = &_gemm_output_3d;
- }
-
- // Configure GEMM
- // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
- const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
- configure_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, gemm_3d_depth);
-
- if(!_skip_im2col)
- {
- _im2col_output.allocator()->allocate();
- }
-
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- // Configure col2im
- _col2im_kernel = std::make_unique<cpu::kernels::CpuCol2ImKernel>();
- _col2im_kernel->configure(gemm_output_to_use->info(), output->info(), Size2D(conv_w, conv_h));
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(gemm_output_to_use, output);
- }
- }
- else
- {
- // Configure reshape layer
- _reshape_layer.configure(gemm_output_to_use, output);
- }
-
- if(_is_quantized && !_skip_col2im)
- {
- _tmp_output.allocator()->allocate();
- }
-
- _gemm_output.allocator()->allocate();
-
- ARM_COMPUTE_ERROR_ON_MSG((output->info()->dimension(idx_width) != conv_w) || (output->info()->dimension(idx_height) != conv_h),
- "Output shape does not match the expected one");
+ _impl->weights = weights; // Kept so prepare() can later mark the weights as unused
+ _impl->op = std::make_unique<cpu::CpuGemmConvolution>();
+ _impl->op->configure(input->info(), weights->info(), (biases != nullptr ? biases->info() : nullptr), output->info(), conv_info, weights_info, dilation, act_info, num_groups); // The operator is configured on tensor infos only; biases are optional
+
+ _impl->run_pack = // Tensors handed to op->run() on every run() call
+ {
+ { TensorType::ACL_SRC_0, input },
+ { TensorType::ACL_SRC_1, weights },
+ { TensorType::ACL_SRC_2, biases }, // Added even when biases is nullptr
+ { TensorType::ACL_DST, output }
+ };
+ _impl->prep_pack = // Tensors handed to op->prepare()
+ {
+ { TensorType::ACL_SRC_1, weights },
+ { TensorType::ACL_SRC_2, biases },
+ };
+ _impl->aux_mem_req = _impl->op->workspace(); // Auxiliary memory requirements reported by the operator
+ _impl->workspace_tensors = manage_workspace<Tensor>(_impl->aux_mem_req, _impl->memory_group, _impl->run_pack, _impl->prep_pack); // Builds the workspace tensors; presumably also registers them in both packs (confirm in src/core/helpers/MemoryHelpers.h)
}
Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
{ // Static configuration check; returns the status produced by the CPU operator
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
-
- const DataLayout data_layout = input->data_layout();
- const DataType data_type = input->data_type();
- const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
- const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
- const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
- const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
-
- const unsigned int kernel_width = weights->dimension(idx_width);
- const unsigned int kernel_height = weights->dimension(idx_height);
-
- TensorInfo im2col_reshaped_info{};
- TensorInfo info_gemm{};
- TensorInfo tmp_info{};
- TensorInfo weights_reshaped_info{};
- const ITensorInfo *gemm_input_to_use = input;
- const ITensorInfo *gemm_output_to_use = output;
- const ITensorInfo *weights_to_use = weights;
-
- const bool append_bias = false;
- const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
- const bool is_bf16 = data_type == DataType::BFLOAT16;
- bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1);
-
- // Get convolved dimensions
- unsigned int conv_w = 0;
- unsigned int conv_h = 0;
-
- std::tie(conv_w, conv_h) = scaled_dimensions(input->dimension(idx_width),
- input->dimension(idx_height),
- kernel_width,
- kernel_height,
- conv_info,
- dilation);
-
- // Check if GEMM3D is supported
- bool skip_col2im = false;
- if(data_layout == DataLayout::NHWC)
- {
- skip_col2im = bool(validate_gemm3d(input, weights, act_info, conv_h, true));
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!skip_col2im)
- {
- skip_im2col = false;
- }
- }
-
- if(skip_col2im)
- {
- // If not supported, we need to perform im2col and col2im (or reshape layer)
- if(!bool(validate_gemm3d(input, weights, act_info, conv_h, skip_im2col)))
- {
- skip_im2col = false;
- skip_col2im = false;
- }
- }
-
- ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != input->dimension(idx_channel));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
-
- // Validate biases
- if(biases != nullptr)
- {
- if(is_quantized)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(is_bf16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- unsigned int mat_weights_cols = weights->dimension(idx_kernels);
- unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
-
- // Output tensor auto inizialization if not yet initialized
- ARM_COMPUTE_RETURN_ON_ERROR(NEConvolutionLayerReshapeWeights::validate(weights, nullptr, nullptr));
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
- weights_reshaped_info.set_quantization_info(weights->quantization_info());
- weights_to_use = &weights_reshaped_info;
-
- if(!skip_im2col)
- {
- // Create tensor info for im2col reshaped inputs
- // For CPU, the batch size is on the fourth dimension
- TensorShape shape_im2col = input->tensor_shape();
- shape_im2col.set(0, mat_weights_rows);
- shape_im2col.set(1, conv_w * conv_h);
- shape_im2col.set(2, 1);
-
- im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
- im2col_reshaped_info.set_quantization_info(input->quantization_info());
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
- gemm_input_to_use = &im2col_reshaped_info;
- }
-
- // Create temporary GEMM output tensor in case we cannot skip col2im
- const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
- if(!skip_col2im)
- {
- TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
- shape_gemm.set(0, mat_weights_cols);
- shape_gemm.set(1, conv_w * conv_h);
- info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
- }
- else
- {
- info_gemm = TensorInfo(output->tensor_shape(), 1, output_data_type);
- }
- info_gemm.set_quantization_info(output->quantization_info()).set_data_layout(input->data_layout());
- gemm_output_to_use = &info_gemm;
- ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
-
- // Validate Col2Im/ReshapeLayer
- if(!skip_col2im && (data_layout == DataLayout::NCHW))
- {
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuCol2ImKernel::validate(gemm_output_to_use, output, Size2D(conv_w, conv_h)));
- }
-
- return Status{};
+ return cpu::CpuGemmConvolution::validate(input, weights, biases, output, conv_info, weights_info, dilation, act_info, num_groups); // All arguments are forwarded unchanged; no checks are done at this level
}
void NEGEMMConvolutionLayer::run() // Runs the convolution on the tensors recorded by configure()
{
prepare(); // Does nothing after the first call (guarded by is_prepared)
-
- MemoryGroupResourceScope scope_mg(_memory_group);
-
- bool out_has_padding = _skip_col2im && (_original_output->info()->padding().bottom != 0 || _original_output->info()->padding().top != 0);
-
- if(!_skip_im2col)
- {
- // Run input reshaping
- unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, _input },
- { TensorType::ACL_DST, &_im2col_output }
- };
- NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
- }
-
- // Handle the case where output has top/bottom padding
- const ITensor *out_to_use = out_has_padding ? &_gemm_output : _original_output;
- _gemm_output_3d.info()->extend_padding(out_to_use->info()->padding());
- _gemm_output_3d.allocator()->import_memory(out_to_use->buffer());
-
- // Runs NEGEMM or NEGEMMLowpMatrixMultiplyCore functions
- if(_is_quantized)
- {
- // Run gemmlowp
- _mm_gemmlowp.run();
- }
- else
- {
- // Run gemm
- _mm_gemm.run();
- }
-
- // Reshape output matrix
- if(!_skip_col2im)
- {
- if(_data_layout == DataLayout::NCHW)
- {
- ITensorPack pack =
- {
- { TensorType::ACL_SRC, &_gemm_output },
- { TensorType::ACL_DST, _original_output }
- };
- NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
- }
- else
- {
- _reshape_layer.run();
- }
- }
- else if(out_has_padding)
- {
- _reshape_layer.run();
- }
-
- _gemm_output_3d.allocator()->free();
+ MemoryGroupResourceScope scope_mg(_impl->memory_group); // Scope object tied to the memory group; presumably acquires on construction and releases when it goes out of scope (confirm)
+ _impl->op->run(_impl->run_pack); // The whole computation is delegated to the operator
}
void NEGEMMConvolutionLayer::prepare() // One-time preparation: runs the prepare step of the operator, then releases what was only needed for it
{
- if(!_is_prepared)
+ if(!_impl->is_prepared)
{
- if(_weights_manager && _weights_manager->are_weights_managed(_original_weights))
- {
- _weights_manager->run(_original_weights, &_reshape_weights_managed);
- }
- else
- {
- // Run weights reshaping and mark original weights tensor as unused
- _weights_reshaped.allocator()->allocate();
- _reshape_weights.run();
- _original_weights->mark_as_unused();
- }
+ _impl->op->prepare(_impl->prep_pack);
+ auto has_reshape = std::find_if(_impl->aux_mem_req.begin(), // Search for any auxiliary memory request with Persistent lifetime
+ _impl->aux_mem_req.end(),
+ [](const MemoryInfo & m) -> bool { return m.lifetime == MemoryLifetime::Persistent; });
- // Prepare GEMM
- _is_quantized ? _mm_gemmlowp.prepare() : _mm_gemm.prepare();
- if(!_weights_reshaped.is_used())
+ if(has_reshape != std::end(_impl->aux_mem_req)) // At least one persistent request was found
{
- _weights_reshaped.allocator()->free();
+ _impl->weights->mark_as_unused(); // Presumably safe because the operator keeps its own reshaped copy in persistent memory (confirm)
}
-
- _is_prepared = true;
+ for(auto &ws : _impl->workspace_tensors) // Free every workspace tensor whose slot has Prepare lifetime
+ {
+ const int slot = ws.first;
+ for(auto &m : _impl->aux_mem_req)
+ {
+ if(m.slot == slot && m.lifetime == MemoryLifetime::Prepare)
+ {
+ auto tensor = ws.second.get();
+ tensor->allocator()->free();
+ break; // Stop scanning requirements for this slot once one has matched
+ }
+ }
+ }
+ _impl->is_prepared = true; // Later calls skip all of the above
}
}
} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.cpp b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
new file mode 100644
index 0000000000..a0424b1c63
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.cpp
@@ -0,0 +1,602 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuGemmConvolution.h"
+
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+
+#include "src/core/cpu/kernels/CpuCol2ImKernel.h"
+#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
+#include "src/core/cpu/kernels/CpuReshapeKernel.h"
+#include "src/core/cpu/kernels/CpuWeightsReshapeKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/runtime/cpu/operators/CpuGemm.h"
+#include "src/runtime/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h"
+#include "src/runtime/cpu/operators/CpuGemmLowpOutputStage.h"
+#include "src/runtime/cpu/utils/CpuAuxTensorHandler.h"
+
+#include <set>
+#include <tuple>
+
+using namespace arm_compute::misc::shape_calculator;
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace cpu
+{
+CpuGemmConvolution::CpuGemmConvolution() // Builds an unconfigured operator; kernels and GEMM operators are created later in configure()/configure_mm()
+ : _weights_reshape_kernel(nullptr), _im2col_kernel(), _mm_gemm(), _mm_gemmlowp(), _col2im_kernel(), _reshape_kernel(), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(),
+ _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) // NOTE(review): presumably yields Count default workspace entries - confirm against MemoryRequirements
+{
+}
+CpuGemmConvolution::~CpuGemmConvolution() = default; // Defined in this file, where the headers of the owned kernel/operator types are included
+
+void CpuGemmConvolution::configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const ActivationLayerInfo &act_info, int gemm_3d_depth) // Creates and configures CpuGemm or CpuGemmLowpMatrixMultiplyCore, depending on _is_quantized
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, gemm_3d_depth, _skip_im2col));
+
+ // Create GEMMInfo structure
+ const GEMMInfo &gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
+ // Supported activations in GEMM
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+
+ if(_is_quantized)
+ {
+ TensorInfo tmp_src{ *src }; // Local copies, so the quantization info can be changed without touching the caller's infos
+ TensorInfo tmp_weights{ *weights };
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo iqinfo = src->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info(); // An empty dst falls back to the input quantization info
+ const UniformQuantizationInfo uiqinfo = iqinfo.uniform();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = src->data_type();
+
+ tmp_src.set_quantization_info(QuantizationInfo(uiqinfo.scale, -uiqinfo.offset));
+ if(!is_data_type_quantized_per_channel(tmp_weights.data_type())) // Per-channel weights keep their quantization info unchanged
+ {
+ const UniformQuantizationInfo uwqinfo = wqinfo.uniform();
+ tmp_weights.set_quantization_info(QuantizationInfo(uwqinfo.scale, -uwqinfo.offset));
+ }
+
+ // Merge activation with output stage
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+
+ if(supported_acts.count(act_info.activation()) != 0) // NOTE(review): validate_mm() additionally checks act_info.enabled() here - confirm the difference is intended
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ output_info.is_quantized_per_channel = (tmp_weights.data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info); // Return status is ignored here; validate_mm() above already checked the same call
+
+ _mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
+ _mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false, false, act_info));
+
+ auto mm_mem_req = _mm_gemmlowp->workspace();
+ for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) // NOTE(review): assumes the GEMM workspace has fewer entries than Im2ColOutput, so it neither overruns _aux_mem nor collides with our slots - confirm
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+ else
+ {
+ // Configure matrix multiply function
+ _mm_gemm = std::make_unique<CpuGemm>();
+ _mm_gemm->configure(src, weights, biases, dst, 1.0f, 0.0f, gemm_info);
+ auto mm_mem_req = _mm_gemm->workspace();
+ for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) // Same slot-copy assumption as in the quantized branch
+ {
+ _aux_mem[cont] = mm_mem_req[cont];
+ }
+ }
+}
+
+Status CpuGemmConvolution::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, // Validates the GEMM stage: CpuGemmLowpMatrixMultiplyCore for asymmetric quantized src, CpuGemm otherwise
+ const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+{
+ const DataType data_type = src->data_type();
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool is_activation_enabled = act_info.enabled();
+
+ // Create GEMMInfo structure
+ const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */,
+ gemm_3d_depth, skip_im2col /* Reinterpret the input as 3D if im2col is skipped */,
+ false, GEMMLowpOutputStageInfo(), false, false, act_info);
+
+ if(is_quantized)
+ {
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo &iqinfo = src->quantization_info();
+ const QuantizationInfo &wqinfo = weights->quantization_info();
+ const QuantizationInfo &oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info(); // An empty dst falls back to the input quantization info
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+
+ // Merge activation with output stage
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ if(is_activation_enabled && supported_acts.count(act_info.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act_info, data_type, uoqinfo);
+ }
+
+ GEMMLowpOutputStageInfo output_info;
+ output_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ output_info.gemmlowp_offset = uoqinfo.offset;
+ output_info.gemmlowp_min_bound = min_activation;
+ output_info.gemmlowp_max_bound = max_activation;
+ output_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, output_info));
+
+ // Perform validation step on GEMMLowp
+ std::unique_ptr<ITensorInfo> input_qa = src->clone();
+ std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+ input_qa->set_quantization_info(QuantizationInfo(iqinfo.uniform().scale, -iqinfo.uniform().offset));
+ weights_qa->set_quantization_info(QuantizationInfo(wqinfo.uniform().scale, -wqinfo.uniform().offset)); // NOTE(review): configure_mm() skips this rewrite for per-channel weights; here it is unconditional - confirm
+ return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst, GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false, output_info, false, false, act_info));
+ }
+ else
+ {
+ // Perform validation step on Matrix multiply function
+ return CpuGemm::validate(src, weights, nullptr, dst, 1.0f, 0.0f, gemm_info); // NOTE(review): biases are not forwarded here although configure_mm() passes them to CpuGemm::configure - confirm
+ }
+}
+
+Status CpuGemmConvolution::validate_gemm3d(const ITensorInfo *input_info, const ITensorInfo *weights_info, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col)
+{
+ // Probe GEMM3D support through validate_mm() with small placeholder tensors: the real shapes are not used, only data type, quantization info, activation and depth
+ const unsigned int rows_scale = skip_im2col ? 1U : gemm_3d_depth;
+ const unsigned int planes = skip_im2col ? gemm_3d_depth : 1U;
+ const DataType dt = input_info->data_type();
+
+ const TensorInfo probe_src(TensorShape(4U, 4U * rows_scale, planes), 1, dt, input_info->quantization_info());
+ const TensorInfo probe_wei(TensorShape(4U, 4U), 1, dt, weights_info->quantization_info());
+ const TensorInfo probe_dst(TensorShape(4U, 4U, gemm_3d_depth), 1, dt, input_info->quantization_info());
+
+ return validate_mm(&probe_src, &probe_wei, nullptr, &probe_dst, act_info, gemm_3d_depth, skip_im2col);
+}
+
+void CpuGemmConvolution::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info, // Configures weights reshape, optional im2col, the GEMM stage and col2im/reshape, then fills the workspace requirements
+ const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_UNUSED(num_groups, weights_info); // Both are still passed to validate() below; weights_info is also read for _is_prepared
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmConvolution::validate(src,
+ weights,
+ biases,
+ dst,
+ conv_info,
+ weights_info,
+ dilation,
+ act_info,
+ num_groups));
+
+ const DataType data_type = src->data_type();
+ const DataLayout data_layout = src->data_layout();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+
+ _is_prepared = weights_info.retain_internal_weights(); // When set, prepare() becomes a no-op (it only runs while _is_prepared is false)
+ _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+ _data_layout = data_layout;
+ _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); // 1x1 kernel, unit stride, NHWC
+
+ const ITensorInfo *gemm_input_to_use = src;
+ ITensorInfo *gemm_output_to_use = dst;
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv_info,
+ dilation);
+ ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h),
+ "Output shape does not match the expected one");
+
+ // Check if GEMM3D is supported
+ if(data_layout == DataLayout::NHWC)
+ {
+ _skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!_skip_col2im)
+ {
+ _skip_im2col = false;
+ }
+ }
+ else
+ {
+ _skip_col2im = false;
+ }
+
+ // Get parameters from conv_info
+ unsigned int stride_x = 0; // NOTE(review): stride_x/stride_y are not read again in this function
+ unsigned int stride_y = 0;
+ std::tie(stride_x, stride_y) = conv_info.stride();
+
+ unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+
+ // _weights_reshaped will be auto configured in the kernel.
+ // Biases are not appended (nullptr is passed) and 1xW is not transposed, as it will be reshaped in CpuGemm
+ _weights_reshape_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
+ _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped);
+ _weights_reshaped.set_quantization_info(weights->quantization_info());
+
+ // Create tensor to store im2col reshaped inputs
+ if(!_skip_im2col)
+ {
+ // Configure
+ _im2col_kernel = std::make_unique<kernels::CpuIm2ColKernel>();
+ _im2col_kernel->configure(src, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
+
+ // Update GEMM input
+ gemm_input_to_use = &_im2col_output;
+ }
+
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; // BFLOAT16 input produces an F32 GEMM output
+ if(!_skip_col2im)
+ {
+ TensorShape shape_gemm;
+
+ // Calculate GEMM output shape
+ shape_gemm = _im2col_output.tensor_shape(); // im2col is always configured on this path: _skip_im2col is forced to false above whenever col2im cannot be skipped, and is false for NCHW
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+
+ _gemm_output = TensorInfo(shape_gemm, 1, output_data_type);
+ _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ _gemm_output_3d = TensorInfo(_gemm_output);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output;
+ }
+ else
+ {
+ _gemm_output_3d = TensorInfo(*dst);
+ _gemm_output_3d.set_data_type(output_data_type).set_data_layout(src->data_layout()).set_is_resizable(true);
+ _gemm_output = TensorInfo(_gemm_output_3d);
+
+ // Update GEMM output
+ gemm_output_to_use = &_gemm_output_3d;
+ }
+
+ // Configure GEMM
+ // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
+ const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+ configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, gemm_3d_depth);
+
+ if(!_skip_col2im && _data_layout == DataLayout::NCHW)
+ {
+ // Configure col2im
+ _col2im_kernel = std::make_unique<kernels::CpuCol2ImKernel>();
+ _col2im_kernel->configure(gemm_output_to_use, dst, Size2D(conv_w, conv_h));
+ }
+ else
+ {
+ // Configure reshape layer
+ _reshape_kernel = std::make_unique<kernels::CpuReshapeKernel>(); // Also configured when col2im is skipped: run() uses it if dst has top/bottom padding
+ _reshape_kernel->configure(gemm_output_to_use, dst);
+ }
+
+ _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); // Register this operator's own auxiliary tensors after the GEMM ones copied in configure_mm()
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Prepare, _weights_reshaped.total_size());
+ _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
+ _aux_mem[GemmOutput3d] = MemoryInfo(offset_int_vec(GemmOutput3d), MemoryLifetime::Temporary, _gemm_output_3d.total_size());
+}
+
+Status CpuGemmConvolution::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, // Checks data types, layouts, shapes and the im2col/GEMM/col2im stages without configuring anything
+ const WeightsInfo &weights_info, const Size2D &dilation, const ActivationLayerInfo &act_info, unsigned int num_groups)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!");
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(num_groups > 1, "Grouping (num_groups != 1) is not supported");
+
+ const DataLayout data_layout = src->data_layout();
+ const DataType data_type = src->data_type();
+ const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+ const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+ const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+ const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+ const unsigned int kernel_width = weights->dimension(idx_width);
+ const unsigned int kernel_height = weights->dimension(idx_height);
+
+ TensorInfo im2col_reshaped_info{};
+ TensorInfo info_gemm{};
+ TensorInfo tmp_info{}; // NOTE(review): not referenced again in this function
+ TensorInfo weights_reshaped_info{};
+ const ITensorInfo *gemm_input_to_use = src;
+ const ITensorInfo *gemm_output_to_use = dst;
+ const ITensorInfo *weights_to_use = weights;
+
+ const bool append_bias = false;
+ const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+ const bool is_bf16 = data_type == DataType::BFLOAT16;
+ bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv_info.stride().first == 1 && conv_info.stride().second == 1); // Same rule as in configure(): 1x1 kernel, unit stride, NHWC
+
+ // Get convolved dimensions
+ unsigned int conv_w = 0;
+ unsigned int conv_h = 0;
+
+ std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+ src->dimension(idx_height),
+ kernel_width,
+ kernel_height,
+ conv_info,
+ dilation);
+
+ // Check if GEMM3D is supported
+ bool skip_col2im = false;
+ if(data_layout == DataLayout::NHWC)
+ {
+ skip_col2im = bool(validate_gemm3d(src, weights, act_info, conv_h, true));
+ // If not supported, we need to perform im2col and col2im (or reshape layer)
+ if(!skip_col2im)
+ {
+ skip_im2col = false;
+ }
+ }
+
+ if(skip_col2im)
+ {
+ // Re-check GEMM3D with the actual skip_im2col value; if unsupported, fall back to im2col and col2im (or reshape layer)
+ if(!bool(validate_gemm3d(src, weights, act_info, conv_h, skip_im2col)))
+ {
+ skip_im2col = false;
+ skip_col2im = false;
+ }
+ }
+
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->dimension(idx_channel) != src->dimension(idx_channel));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_quantized)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if(is_bf16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels)); // One bias per output feature map
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+ unsigned int mat_weights_rows = weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
+
+ weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, data_type);
+ weights_reshaped_info.set_quantization_info(weights->quantization_info());
+ weights_to_use = &weights_reshaped_info;
+
+ if(!skip_im2col)
+ {
+ // Create tensor info for im2col reshaped inputs
+ // For CPU, the batch size is on the fourth dimension
+ TensorShape shape_im2col = src->tensor_shape();
+ shape_im2col.set(0, mat_weights_rows);
+ shape_im2col.set(1, conv_w * conv_h);
+ shape_im2col.set(2, 1);
+
+ im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
+ im2col_reshaped_info.set_quantization_info(src->quantization_info());
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuIm2ColKernel::validate(src, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+ gemm_input_to_use = &im2col_reshaped_info;
+ }
+
+ // Create temporary GEMM output tensor in case we cannot skip col2im
+ const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type; // BFLOAT16 input produces an F32 GEMM output
+ if(!skip_col2im)
+ {
+ TensorShape shape_gemm = gemm_input_to_use->tensor_shape();
+ shape_gemm.set(0, mat_weights_cols);
+ shape_gemm.set(1, conv_w * conv_h);
+ info_gemm = TensorInfo(shape_gemm, 1, output_data_type);
+ }
+ else
+ {
+ info_gemm = TensorInfo(dst->tensor_shape(), 1, output_data_type);
+ }
+ info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+ gemm_output_to_use = &info_gemm;
+ ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info, skip_col2im ? conv_h : 0, skip_im2col));
+
+ // Validate Col2Im/ReshapeLayer
+ if(!skip_col2im && (data_layout == DataLayout::NCHW)) // Only col2im is validated here; the reshape kernel used on the other paths is not
+ {
+ ARM_COMPUTE_RETURN_ON_ERROR(kernels::CpuCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h)));
+ }
+
+ return Status{};
+}
+
+void CpuGemmConvolution::run(ITensorPack &tensors) // Executes optional im2col, the GEMM stage and the final col2im/reshape on the tensors found in the pack
+{
+ prepare(tensors); // Does nothing once _is_prepared is set
+
+ auto src = tensors.get_const_tensor(ACL_SRC_0);
+ auto weights = tensors.get_const_tensor(ACL_SRC_1); // NOTE(review): prepare() replaces ACL_SRC_1 in this pack with the reshaped weights on the first call only - confirm later calls still find the reshaped tensor there
+ auto biases = tensors.get_const_tensor(ACL_SRC_2);
+ auto dst = tensors.get_tensor(ACL_DST);
+ auto gemm_input_to_use = src;
+
+ CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
+ CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
+
+ bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0); // Only relevant when GEMM3D writes the output directly
+ if(!_skip_im2col)
+ {
+ // Run input reshaping
+ unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, src },
+ { TensorType::ACL_DST, im2col_output.get() }
+ };
+ NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
+ gemm_input_to_use = im2col_output.get();
+ }
+
+ // Handle the case where output has top/bottom padding
+ const ITensor *out_to_use = out_has_padding ? gemm_output.get() : dst;
+ _gemm_output_3d.extend_padding(out_to_use->info()->padding()); // Updates the member TensorInfo on every run
+ CpuAuxTensorHandler gemm_output_3d(offset_int_vec(GemmOutput3d), _gemm_output_3d, tensors, true);
+ auto gemm_output_to_use = gemm_output.get(); // Default target; overridden below when im2col is skipped or GEMM can write straight into dst
+ if(_skip_im2col)
+ {
+ gemm_output_to_use = gemm_output_3d.get();
+ }
+ if(_skip_col2im && !out_has_padding)
+ {
+ gemm_output_to_use = dst;
+ }
+
+ // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
+ ITensorPack pack_mm =
+ {
+ { TensorType::ACL_SRC_0, gemm_input_to_use },
+ { TensorType::ACL_SRC_1, weights },
+ { TensorType::ACL_SRC_2, biases },
+ { TensorType::ACL_DST, gemm_output_to_use }
+ }; // NOTE(review): only these four entries are forwarded; the GEMM workspace slots copied into _aux_mem are not in this pack - confirm
+ if(_is_quantized)
+ {
+ // Run gemmlowp
+ _mm_gemmlowp->run(pack_mm);
+ }
+ else
+ {
+ // Run gemm
+ _mm_gemm->run(pack_mm);
+ }
+
+ // Reshape output matrix
+ if(!_skip_col2im)
+ {
+ if(_data_layout == DataLayout::NCHW)
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output.get() },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_col2im_kernel.get(), Window::DimY, _col2im_kernel->window(), pack);
+ }
+ else
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output_to_use },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+ }
+ }
+ else if(out_has_padding) // GEMM3D wrote into an auxiliary tensor; copy it into the padded dst with the reshape kernel
+ {
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, gemm_output_to_use },
+ { TensorType::ACL_DST, dst }
+ };
+ NEScheduler::get().schedule_op(_reshape_kernel.get(), Window::DimY, _reshape_kernel->window(), pack);
+ }
+}
+
+void CpuGemmConvolution::prepare(ITensorPack &tensors) // One-off step: reshapes the weights and prepares the GEMM operator
+{
+ if(!_is_prepared)
+ {
+ // Run weights reshaping into the WeightsReshaped workspace tensor (the original weights are not marked as unused here)
+ ITensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ITensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped)));
+ CpuAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p);
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ ITensorPack pack =
+ {
+ { TensorType::ACL_SRC, weights },
+ { TensorType::ACL_DST, weights_reshaped.get() }
+ };
+ NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
+ tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); // NOTE(review): the pack now holds a pointer obtained from the local handler - confirm it stays valid after this function returns
+
+ // Prepare GEMM
+ _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
+ _is_prepared = true;
+ }
+}
+experimental::MemoryRequirements CpuGemmConvolution::workspace() const // Returns a copy of the auxiliary memory requirements filled in by configure()/configure_mm()
+{
+ return _aux_mem;
+}
+} // namespace cpu
+} // namespace arm_compute \ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmConvolution.h b/src/runtime/cpu/operators/CpuGemmConvolution.h
new file mode 100644
index 0000000000..8b41cb4a91
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmConvolution.h
@@ -0,0 +1,197 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMMCONVOLUTION_H
+#define ARM_COMPUTE_CPU_GEMMCONVOLUTION_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace cpu
+{
+class CpuGemm;
+class CpuGemmLowpMatrixMultiplyCore;
+class CpuGemmLowpOutputStage;
+namespace kernels
+{
+class CpuWeightsReshapeKernel;
+class CpuIm2ColKernel;
+class CpuCol2ImKernel;
+class CpuReshapeKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
+ *
+ * -# @ref cpu::kernels::CpuIm2ColKernel
+ * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32)
+ * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout)
+ * -# @ref kernels::CpuWeightsReshapeKernel
+ *
+ */
+class CpuGemmConvolution : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmConvolution();
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuGemmConvolution(const CpuGemmConvolution &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ CpuGemmConvolution(CpuGemmConvolution &&) = delete;
+ /** Prevent instances of this class from being copied (As this class contains pointers) */
+ CpuGemmConvolution &operator=(const CpuGemmConvolution &) = delete;
+ /** Prevent instances of this class from being moved (As this class contains non movable objects) */
+ CpuGemmConvolution &operator=(CpuGemmConvolution &&) = delete;
+ /** Destructor */
+ ~CpuGemmConvolution();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - NHWC
+ * - NCHW
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:------------------|:--------|:--------------|
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |F32 |BFLOAT16 |
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for QASYMM8/QASYMM8_SIGNED @p src where biases should be of S32 type and BFLOAT16 @p src where biases should be of F32 type.
+ * @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with cpu::kernels::CpuWeightsReshapeKernel (already reshaped weights are rejected by validate()). If this is not part of the fully connected layer the weights
+ * tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p src.
+ * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, const WeightsInfo &weights_info = WeightsInfo(),
+ const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to CpuGemmConvolution::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const PadStrideInfo &conv_info,
+ const WeightsInfo &weights_info = WeightsInfo(), const Size2D &dilation = Size2D(1U, 1U), const ActivationLayerInfo &act_info = ActivationLayerInfo(), unsigned int num_groups = 1);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
+ experimental::MemoryRequirements workspace() const override;
+
+private:
+ /** Configures the appropriate matrix multiply routine
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for @p src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] output Output tensor info. Data types supported: Same as @p src,
+ * except for @p src of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+ */
+ void configure_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *output, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ int gemm_3d_depth = 1);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmConvolution matrix multiply routines
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for @p src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] dst Output tensor info. Data types supported: Same as @p src,
+ * except for @p src of QASYMM8/QASYMM8_SIGNED type where output should be of S32 type.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] gemm_3d_depth (Optional) Depth of GEMM 3D (Defaults to 1)
+ * @param[in] skip_im2col (Optional) Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout. (Default to false)
+ *
+ * @return a status
+ */
+ static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo(),
+ int gemm_3d_depth = 1, bool skip_im2col = false);
+ /** Static function to check if GEMM3D is supported in @ref CpuGemm or in @ref CpuGemmLowpMatrixMultiplyCore
+ *
+ * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] act_info Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
+ * @param[in] gemm_3d_depth Depth of GEMM 3D
+ * @param[in] skip_im2col Flag which specifies if im2col has to be skipped. i.e. 1x1 convolution with NHWC data layout
+ *
+ * @return a status
+ */
+ static Status validate_gemm3d(const ITensorInfo *src, const ITensorInfo *weights, const ActivationLayerInfo &act_info, int gemm_3d_depth, bool skip_im2col);
+
+ enum AuxTensorIdx
+ {
+ // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors; the slots below 9 are filled from the GEMM operator's workspace in configure_mm()
+ Im2ColOutput = 9,
+ WeightsReshaped,
+ GemmOutput,
+ GemmOutput3d,
+ Count
+ };
+
+ std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_kernel;
+ std::unique_ptr<cpu::kernels::CpuIm2ColKernel> _im2col_kernel;
+ std::unique_ptr<CpuGemm> _mm_gemm;
+ std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
+ std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel;
+ std::unique_ptr<kernels::CpuReshapeKernel> _reshape_kernel;
+
+ TensorInfo _im2col_output;
+ TensorInfo _weights_reshaped;
+ TensorInfo _gemm_output;
+ TensorInfo _gemm_output_3d;
+
+ DataLayout _data_layout;
+
+ bool _skip_im2col;
+ bool _skip_col2im;
+ bool _is_quantized;
+ bool _is_prepared;
+
+ experimental::MemoryRequirements _aux_mem{ Count }; // The constructor's initializer list also initializes this member from AuxTensorIdx::Count
+};
+} // namespace cpu
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CPU_GEMMCONVOLUTION_H */
diff --git a/tests/validation/NEON/ConvolutionLayer.cpp b/tests/validation/NEON/ConvolutionLayer.cpp
index 04ecb03077..4332db605d 100644
--- a/tests/validation/NEON/ConvolutionLayer.cpp
+++ b/tests/validation/NEON/ConvolutionLayer.cpp
@@ -29,6 +29,7 @@
#include "arm_compute/runtime/Tensor.h"
#include "arm_compute/runtime/TensorAllocator.h"
#include "src/core/helpers/MemoryHelpers.h"
+#include "src/runtime/cpu/operators/CpuGemmConvolution.h"
#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
#include "src/runtime/cpu/operators/CpuWinogradConv2d.h"
#include "tests/NEON/Accessor.h"
@@ -509,6 +510,101 @@ using NEGEMMConvolutionLayerFixture = ConvolutionValidationFixture<Tensor, Acces
template <typename T>
using NEGEMMConvolutionLayerMixedDataLayoutFixture = ConvolutionValidationFixture<Tensor, Accessor, NEConvolutionLayer, T, true>;
+/** Test case for memory injection in @ref cpu::CpuGemmConvolution.
+ *
+ * Configure the operator once and inject memory at run-time in multiple executions.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MemoryInjection, framework::DatasetMode::ALL)
+{
+ auto conv = std::make_unique<cpu::CpuGemmConvolution>();
+ const auto src_info = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+ const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+ const auto bias_info = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+ auto dst_info = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+ const auto conv_info = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+ WeightsInfo weights_info(false, 3U, 3U, 1U);
+ conv->configure(&src_info, &weight_info, &bias_info, &dst_info, conv_info, weights_info);
+
+ // src, weight and bias are created and allocated once here and reused by both runs; only dst is re-created inside the lambda
+ auto src = create_tensor<Tensor>(src_info);
+ auto weight = create_tensor<Tensor>(weight_info);
+ auto bias = create_tensor<Tensor>(bias_info);
+ src.allocator()->allocate();
+ weight.allocator()->allocate();
+ bias.allocator()->allocate();
+
+ ITensorPack run_pack{ { TensorType::ACL_SRC_0, &src }, { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+ ITensorPack prep_pack{ { TensorType::ACL_SRC_1, &weight }, { TensorType::ACL_SRC_2, &bias } };
+
+ auto mg = MemoryGroup{};
+ auto ws = manage_workspace<Tensor>(conv->workspace(), mg, run_pack, prep_pack);
+
+ auto run_conv = [&]() -> Tensor
+ {
+ auto dst = create_tensor<Tensor>(dst_info);
+ dst.allocator()->allocate();
+ run_pack.add_tensor(TensorType::ACL_DST, &dst);
+
+ library->fill_tensor_value(Accessor(src), 1.f);
+ library->fill_tensor_value(Accessor(weight), 2.f);
+ library->fill_tensor_value(Accessor(bias), 3.f);
+ // conv was configured once above and is captured by reference; prepare() and run() are invoked on every call.
+ conv->prepare(prep_pack);
+ conv->run(run_pack);
+ return dst;
+ };
+ auto result_0 = run_conv();
+ auto result_1 = run_conv();
+ for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+ {
+ ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+ }
+}
+
+/** Test case for memory injection in @ref NEGEMMConvolutionLayer.
+ *
+ * Re-configure the same @ref NEGEMMConvolutionLayer with freshly created tensors on every run, to make sure it still works when memory is injected at configure time through the old API.
+ *
+ * Checks performed in order:
+ * - Both runs compute the same output
+ */
+TEST_CASE(MultipleExecutionWithConfigure, framework::DatasetMode::ALL)
+{
+ auto conv = std::make_unique<NEGEMMConvolutionLayer>();
+ const auto src_info = TensorInfo(TensorShape(1U, 5U, 2U), 1, DataType::F32, DataLayout::NCHW);
+ const auto weight_info = TensorInfo(TensorShape(1U, 3U, 2U, 3U), 1, DataType::F32, DataLayout::NCHW);
+ const auto bias_info = TensorInfo(TensorShape(3U), 1, DataType::F32, DataLayout::NCHW);
+ auto dst_info = TensorInfo(TensorShape(1U, 7U, 3U), 1, DataType::F32, DataLayout::NCHW);
+ const auto conv_info = PadStrideInfo(1, 1, 0, 0, 2, 2, DimensionRoundingType::FLOOR);
+ WeightsInfo weights_info(false, 3U, 3U, 1U);
+ auto run_conv = [&]()
+ {
+ auto src = create_tensor<Tensor>(src_info);
+ auto weight = create_tensor<Tensor>(weight_info);
+ auto bias = create_tensor<Tensor>(bias_info);
+ auto dst = create_tensor<Tensor>(dst_info);
+ conv->configure(&src, &weight, &bias, &dst, conv_info, weights_info);
+ src.allocator()->allocate();
+ weight.allocator()->allocate();
+ bias.allocator()->allocate();
+ dst.allocator()->allocate();
+ library->fill_tensor_value(Accessor(src), 1.f);
+ library->fill_tensor_value(Accessor(weight), 2.f);
+ library->fill_tensor_value(Accessor(bias), 3.f);
+ conv->run();
+ return dst;
+ };
+ auto result_0 = run_conv();
+ auto result_1 = run_conv();
+ for(size_t i = 0; i < result_0.info()->tensor_shape().total_size(); ++i)
+ {
+ ARM_COMPUTE_EXPECT(((float *)result_0.buffer())[i] == ((float *)result_1.buffer())[i], framework::LogLevel::ERRORS);
+ }
+}
+
TEST_SUITE(Float)
#if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
TEST_SUITE(BFLOAT16)