Port NEIm2ColKernel

Resolves: COMPMID-4510 Change-Id: Ia3e588f599449d975dabad4afafb2974dd44d0ad Signed-off-by: Manuel Bottini <manuel.bottini@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5899 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
author: Manuel Bottini <manuel.bottini@arm.com> 2021-06-30 18:29:18 +0100
committer: Manuel Bottini <manuel.bottini@arm.com> 2021-07-06 11:03:31 +0000
commit: 900289936c458eff95499e0a0eaba989a27aaa4d (patch)
tree: 305853a38fd66842d19aa1a2d1cad88a70b946bc
parent: 6132c7aeaf6230a4e8b074309327762a9e4be003 (diff)
download: ComputeLibrary-900289936c458eff95499e0a0eaba989a27aaa4d.tar.gz
13 files changed, 250 insertions, 170 deletions
diff --git a/Android.bp b/Android.bp
index 670f0697d7..621d013e8b 100644
--- a/Android.bp
+++ b/Android.bp
@@ -159,7 +159,6 @@ cc_library_static {
         "src/core/NEON/kernels/NEGEMMLowpReductionKernel.cpp",
         "src/core/NEON/kernels/NEGatherKernel.cpp",
         "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.cpp",
-        "src/core/NEON/kernels/NEIm2ColKernel.cpp",
         "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.cpp",
         "src/core/NEON/kernels/NEL2NormalizeLayerKernel.cpp",
         "src/core/NEON/kernels/NELogicalKernel.cpp",
@@ -275,6 +274,7 @@ cc_library_static {
         "src/core/cpu/kernels/CpuGemmMatrixAdditionKernel.cpp",
         "src/core/cpu/kernels/CpuGemmMatrixMultiplyKernel.cpp",
         "src/core/cpu/kernels/CpuGemmTranspose1xWKernel.cpp",
+        "src/core/cpu/kernels/CpuIm2ColKernel.cpp",
         "src/core/cpu/kernels/CpuMulKernel.cpp",
         "src/core/cpu/kernels/CpuPermuteKernel.cpp",
         "src/core/cpu/kernels/CpuPool2dKernel.cpp",
diff --git a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
index e409a61ba1..43f1d4cc05 100644
--- a/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEFullyConnectedLayer.h
@@ -77,7 +77,7 @@ private:
 } // namespace weights_transformations
 
 /** Basic function to compute a Fully Connected layer. This function calls the following kernels:
- *  -# @ref NEIm2ColKernel (called when the input comes from a convolutional layer)
+ *  -# @ref cpu::kernels::CpuIm2ColKernel (called when the input comes from a convolutional layer)
  *  -# @ref NETranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true ) (called once)
  *  -# @ref NEGEMM or @ref NEGEMMLowpMatrixMultiplyCore (if quantized asymmetric)
  *  -# @ref cpu::kernels::CpuGemmMatrixAdditionKernel or @ref NEGEMMLowpOutputStage (if quantized asymmetric) (if @p biases is not equal to nullptr)
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
index d334d518e2..655d733bd1 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConvolutionLayer.h
@@ -41,8 +41,14 @@ namespace arm_compute
 {
 class ITensor;
 class NECol2ImKernel;
-class NEIm2ColKernel;
 class NEWeightsReshapeKernel;
+namespace cpu
+{
+namespace kernels
+{
+class CpuIm2ColKernel;
+} // namespace kernels
+} // namespace cpu
 
 /** Function to reshape the weights. This function calls the following kernel:
  * -# @ref NEWeightsReshapeKernel
@@ -152,7 +158,7 @@ private:
 
 /** Basic function to compute the convolution layer. This function calls the following kernels/functions:
  *
- * -# @ref NEIm2ColKernel
+ * -# @ref cpu::kernels::CpuIm2ColKernel
  * -# @ref NEGEMM (if the data type is BFLOAT16/FP16/FP32)
  * -# @ref NEGEMMLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
  * -# @ref NEGEMMLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
@@ -283,12 +289,13 @@ private:
     IWeightsManager                                                   *_weights_manager;
     NEConvolutionLayerReshapeWeights                                   _reshape_weights;
     weights_transformations::NEConvolutionLayerReshapeWeightsTransform _reshape_weights_managed;
-    std::unique_ptr<NEIm2ColKernel>                                    _im2col_kernel;
+    std::unique_ptr<cpu::kernels::CpuIm2ColKernel>                     _im2col_kernel;
     NEGEMM                                                             _mm_gemm;
     NEGEMMLowpMatrixMultiplyCore                                       _mm_gemmlowp;
     std::unique_ptr<NECol2ImKernel>                                    _col2im_kernel;
     NEReshapeLayer                                                     _reshape_layer;
 
+    const ITensor *_input;
     const ITensor *_original_weights;
     const ITensor *_original_output;
 
diff --git a/docs/user_guide/release_version_and_change_log.dox b/docs/user_guide/release_version_and_change_log.dox
index 0c8b57ff9f..78c13041ee 100644
--- a/docs/user_guide/release_version_and_change_log.dox
+++ b/docs/user_guide/release_version_and_change_log.dox
@@ -585,7 +585,7 @@ v20.05 Public major release
  - Added Bfloat16 support in:
      - @ref NEWeightsReshapeKernel
      - @ref NEConvolutionLayerReshapeWeights
-     - @ref NEIm2ColKernel
+     - NEIm2ColKernel
      - NEIm2Col
      - NEDepthConvertLayerKernel
      - @ref NEDepthConvertLayer
@@ -1362,7 +1362,7 @@ v17.03.1 First Major public release of the sources
    - @ref NENormalizationLayerKernel / @ref NENormalizationLayer
    - NETransposeKernel / @ref NETranspose
    - NELogits1DMaxKernel, NELogits1DShiftExpSumKernel, NELogits1DNormKernel / @ref NESoftmaxLayer
-   - @ref NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
+   - NEIm2ColKernel, @ref NECol2ImKernel, NEConvolutionLayerWeightsReshapeKernel / @ref NEConvolutionLayer
    - NEGEMMMatrixAccumulateBiasesKernel / @ref NEFullyConnectedLayer
    - @ref NEGEMMLowpMatrixMultiplyKernel / NEGEMMLowp
 
diff --git a/filelist.json b/filelist.json
index 7512ac12bd..9562cc7115 100644
--- a/filelist.json
+++ b/filelist.json
@@ -1330,7 +1330,7 @@
       "Im2Col": {
         "files": {
           "kernel": [
-            "src/core/NEON/kernels/NEIm2ColKernel.cpp"
+            "src/core/cpu/kernels/CpuIm2ColKernel.cpp"
           ]
         }
       },
diff --git a/src/core/NEON/NEKernels.h b/src/core/NEON/NEKernels.h
index 665c8c7fba..69c8d7bebc 100644
--- a/src/core/NEON/NEKernels.h
+++ b/src/core/NEON/NEKernels.h
@@ -47,7 +47,6 @@
 #include "src/core/NEON/kernels/NEGEMMLowpReductionKernel.h"
 #include "src/core/NEON/kernels/NEGatherKernel.h"
 #include "src/core/NEON/kernels/NEGenerateProposalsLayerKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
 #include "src/core/NEON/kernels/NEInstanceNormalizationLayerKernel.h"
 #include "src/core/NEON/kernels/NEL2NormalizeLayerKernel.h"
 #include "src/core/NEON/kernels/NELogicalKernel.h"
diff --git a/src/core/NEON/kernels/NECol2ImKernel.h b/src/core/NEON/kernels/NECol2ImKernel.h
index 397bf5ab17..1976302036 100644
--- a/src/core/NEON/kernels/NECol2ImKernel.h
+++ b/src/core/NEON/kernels/NECol2ImKernel.h
@@ -34,7 +34,7 @@ class ITensor;
 
 /** Kernel to perform col2im reshaping.
  *
- * Rearranges each matrix column into image blocks. It's the inverse operation of @ref NEIm2ColKernel.
+ * Rearranges each matrix column into image blocks. It's the inverse operation of @ref cpu::kernels::CpuIm2ColKernel.
  *
  * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3:
  *
diff --git a/src/core/NEON/kernels/NEWeightsReshapeKernel.h b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
index 76eca9fe86..5701c84cac 100644
--- a/src/core/NEON/kernels/NEWeightsReshapeKernel.h
+++ b/src/core/NEON/kernels/NEWeightsReshapeKernel.h
@@ -33,7 +33,7 @@ class ITensor;
 /** Kernel to perform reshaping on the weights used by convolution and locally connected layer
  *
  * Rearranges each 3-dimensional kernel to a single row leading to a matrix with linearized kernels.
- * In combination with the @ref NEIm2ColKernel can transform a convolution to a matrix multiplication.
+ * In combination with the @ref cpu::kernels::CpuIm2ColKernel can transform a convolution to a matrix multiplication.
  *
  * For example assuming a 3D weight kernel of 3x3 dimensions and depth of 2 we have:
  * @f[
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.cpp b/src/core/cpu/kernels/CpuIm2ColKernel.cpp
index a28a77a4fb..a5dbcc29c8 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.cpp
+++ b/src/core/cpu/kernels/CpuIm2ColKernel.cpp
@@ -21,7 +21,7 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
 
 #include "arm_compute/core/Error.h"
 #include "arm_compute/core/Helpers.h"
@@ -42,9 +42,13 @@
 #include <cstring>
 #include <tuple>
 
-using namespace arm_compute;
+namespace arm_compute
+{
 using namespace misc::shape_calculator;
-
+namespace cpu
+{
+namespace kernels
+{
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
@@ -75,33 +79,6 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                                        bool has_bias, const Size2D &dilation)
-{
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output tensor auto initialization if not yet initialized
-    auto_init_if_empty(*output, input->clone()->set_tensor_shape(compute_im2col_conv_shape(input, kernel_dims, conv_info, has_bias, dilation, false)));
-
-    const DataLayout   data_layout = input->data_layout();
-    const unsigned int width_idx   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
-    const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
-
-    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input->dimension(width_idx), input->dimension(height_idx),
-                                                                             kernel_dims.width, kernel_dims.height,
-                                                                             conv_info, dilation);
-
-    Window win = calculate_max_window(*input, Steps());
-    win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
-    win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
-    win.set(channel_idx, Window::Dimension(0, 1, 1));
-
-    // The NEIm2ColKernel doesn't need padding so update_window_and_padding() can be skipped
-
-    return std::make_pair(Status{}, win);
-}
-
 template <typename T, bool has_pads>
 inline void linearize_volume_nchw(const uint8_t *const in_ptr,
                                   T                   *out_ptr,
@@ -272,26 +249,26 @@ inline void linearize_volume_nhwc(const uint8_t *const in_ptr,
 } // namespace
 
 template <typename T, bool has_pads, bool is_nchw>
-void NEIm2ColKernel::run_im2col(const Window &window)
+void CpuIm2ColKernel::run_im2col(const ITensor *src, ITensor *dst, const Window &window)
 {
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
 
     const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
     const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
     const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
-    const int input_w        = _input->info()->dimension(width_idx);
-    const int input_h        = _input->info()->dimension(height_idx);
-    const int input_c        = _input->info()->dimension(channel_idx);
-    const int input_stride_x = _input->info()->strides_in_bytes().x();
-    const int input_stride_y = _input->info()->strides_in_bytes().y();
-    const int input_stride_z = _input->info()->strides_in_bytes().z();
+    const int input_w        = src->info()->dimension(width_idx);
+    const int input_h        = src->info()->dimension(height_idx);
+    const int input_c        = src->info()->dimension(channel_idx);
+    const int input_stride_x = src->info()->strides_in_bytes().x();
+    const int input_stride_y = src->info()->strides_in_bytes().y();
+    const int input_stride_z = src->info()->strides_in_bytes().z();
     const int pad_left       = _conv_info.pad_left();
     const int pad_top        = _conv_info.pad_top();
     const int stride_x       = _conv_info.stride().first;
     const int stride_y       = _conv_info.stride().second;
-    const int pad_value      = is_data_type_quantized(_input->info()->data_type()) ? _input->info()->quantization_info().uniform().offset : 0;
+    const int pad_value      = is_data_type_quantized(src->info()->data_type()) ? src->info()->quantization_info().uniform().offset : 0;
 
     Window window_in_out(window);
     // The first three dimensions of the input and output are increased by the inner loops
@@ -300,8 +277,8 @@ void NEIm2ColKernel::run_im2col(const Window &window)
     window_in_out.set(Window::DimZ, Window::Dimension(0, 0, 0));
 
     // Create iterators
-    Iterator in(_input, window_in_out);
-    Iterator out(_output, window_in_out);
+    Iterator in(src, window_in_out);
+    Iterator out(dst, window_in_out);
 
     execute_window_loop(window, [&](const Coordinates & id)
     {
@@ -310,7 +287,7 @@ void NEIm2ColKernel::run_im2col(const Window &window)
 
         // Get pointers
         const uint8_t *const input_ptr  = in.ptr();
-        auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * _output->info()->strides_in_bytes().y());
+        auto                 output_ptr = reinterpret_cast<T *>(out.ptr() + (id[width_idx] + id[height_idx] * _convolved_dims.first) * dst->info()->strides_in_bytes().y());
 
         // Linearize volume
         if(is_nchw)
@@ -354,53 +331,47 @@ void NEIm2ColKernel::run_im2col(const Window &window)
     in, out);
 }
 
-NEIm2ColKernel::NEIm2ColKernel()
-    : _func(), _input(nullptr), _output(nullptr), _convolved_dims(), _conv_info(), _kernel_width(0), _kernel_height(0), _has_bias(false), _dilation(1U, 1U), _data_layout(DataLayout::UNKNOWN)
-{
-}
-
-void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                               bool has_bias, const Size2D &dilation, unsigned int num_groups)
+void CpuIm2ColKernel::configure(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                                bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation, num_groups));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
     ARM_COMPUTE_UNUSED(num_groups);
 
-    _data_layout                  = input->info()->data_layout();
-    const unsigned int width_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
-    const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    _data_layout                   = src->data_layout();
+    const unsigned int width_idx   = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH);
+    const unsigned int height_idx  = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
+    const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL);
 
-    _input          = input;
-    _output         = output;
     _conv_info      = conv_info;
     _kernel_width   = kernel_dims.width;
     _kernel_height  = kernel_dims.height;
     _dilation       = dilation;
-    _convolved_dims = scaled_dimensions(input->info()->dimension(width_idx), input->info()->dimension(height_idx),
+    _convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
                                         _kernel_width, _kernel_height,
                                         _conv_info, _dilation);
     _has_bias = has_bias;
 
     if(_data_layout == DataLayout::NCHW)
     {
-        switch(_input->info()->data_type())
+        switch(src->data_type())
         {
             case DataType::F32:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, true> : &NEIm2ColKernel::run_im2col<float, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, true> : &CpuIm2ColKernel::run_im2col<float, true, true>;
                 break;
 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
             case DataType::BFLOAT16:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<bfloat16, false, true> : &NEIm2ColKernel::run_im2col<bfloat16, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, true> : &CpuIm2ColKernel::run_im2col<bfloat16, true, true>;
                 break;
 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, true> : &NEIm2ColKernel::run_im2col<float16_t, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, true> : &CpuIm2ColKernel::run_im2col<float16_t, true, true>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8_SIGNED:
             case DataType::QASYMM8:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<qasymm8_t, false, true> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, true>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<qasymm8_t, false, true> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, true>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -409,26 +380,26 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
     }
     else
     {
-        switch(_input->info()->data_type())
+        switch(src->data_type())
         {
             case DataType::F32:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float, false, false> : &NEIm2ColKernel::run_im2col<float, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float, false, false> : &CpuIm2ColKernel::run_im2col<float, true, false>;
                 break;
 #if defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16)
             case DataType::BFLOAT16:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<bfloat16, false, false> : &NEIm2ColKernel::run_im2col<bfloat16, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<bfloat16, false, false> : &CpuIm2ColKernel::run_im2col<bfloat16, true, false>;
                 break;
 #endif /* defined(__ARM_FEATURE_BF16_VECTOR_ARITHMETIC) || defined(ARM_COMPUTE_FORCE_BF16) */
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
             case DataType::F16:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<float16_t, false, false> : &NEIm2ColKernel::run_im2col<float16_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<float16_t, false, false> : &CpuIm2ColKernel::run_im2col<float16_t, true, false>;
                 break;
 #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */
             case DataType::QASYMM8:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<uint8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<uint8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                 break;
             case DataType::QASYMM8_SIGNED:
-                _func = (!conv_info.has_padding()) ? &NEIm2ColKernel::run_im2col<int8_t, false, false> : &NEIm2ColKernel::run_im2col<qasymm8_t, true, false>;
+                _func = (!conv_info.has_padding()) ? &CpuIm2ColKernel::run_im2col<int8_t, false, false> : &CpuIm2ColKernel::run_im2col<qasymm8_t, true, false>;
                 break;
             default:
                 ARM_COMPUTE_ERROR("Data type not supported");
@@ -436,25 +407,42 @@ void NEIm2ColKernel::configure(const ITensor *input, ITensor *output, const Size
         }
     }
 
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, false)));
+
+    std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(src->dimension(width_idx), src->dimension(height_idx),
+                                                                             kernel_dims.width, kernel_dims.height,
+                                                                             conv_info, dilation);
+
+    Window win = calculate_max_window(*src, Steps());
+    win.set(width_idx, Window::Dimension(0, convolved_dims.first, 1));
+    win.set(height_idx, Window::Dimension(0, convolved_dims.second, 1));
+    win.set(channel_idx, Window::Dimension(0, 1, 1));
     // Configure kernel window
-    auto win_config = validate_and_configure_window(input->info(), output->info(), kernel_dims, conv_info, has_bias, dilation);
-    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
-    INEKernel::configure(win_config.second);
+    ICpuKernel::configure(win);
 }
 
-Status NEIm2ColKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
-                                bool has_bias, const Size2D &dilation, unsigned int num_groups)
+Status CpuIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+                                 bool has_bias, const Size2D &dilation, unsigned int num_groups)
 {
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, output, kernel_dims, conv_info, has_bias, dilation, num_groups));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(), output->clone().get(), kernel_dims, conv_info, has_bias, dilation).first);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups));
     return Status{};
 }
 
-void NEIm2ColKernel::run(const Window &window, const ThreadInfo &info)
+void CpuIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info)
 {
     ARM_COMPUTE_UNUSED(info);
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(INEKernel::window(), window);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICpuKernel::window(), window);
 
-    (this->*_func)(window);
+    auto src = tensors.get_const_tensor(TensorType::ACL_SRC);
+    auto dst = tensors.get_tensor(TensorType::ACL_DST);
+    (this->*_func)(src, dst, window);
+}
+const char *CpuIm2ColKernel::name() const
+{
+    return "CpuIm2ColKernel";
 }
+} // namespace kernels
+} // namespace cpu
+} // namespace arm_compute
+\ No newline at end of file
diff --git a/src/core/NEON/kernels/NEIm2ColKernel.h b/src/core/cpu/kernels/CpuIm2ColKernel.h
index 6c1c631d82..4301a237fe 100644
--- a/src/core/NEON/kernels/NEIm2ColKernel.h
+++ b/src/core/cpu/kernels/CpuIm2ColKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -21,16 +21,20 @@
  * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
  * SOFTWARE.
  */
-#ifndef ARM_COMPUTE_NEIM2COLKERNEL_H
-#define ARM_COMPUTE_NEIM2COLKERNEL_H
+#ifndef ARM_COMPUTE_CPU_IM2COL_KERNEL_H
+#define ARM_COMPUTE_CPU_IM2COL_KERNEL_H
 
-#include "src/core/NEON/INEKernel.h"
+#include "arm_compute/core/Size2D.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
 
 namespace arm_compute
 {
 class ITensor;
-class Size2D;
-
+namespace cpu
+{
+namespace kernels
+{
 /** Interface for the im2col reshape kernel.
  *
  * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column.
@@ -54,86 +58,66 @@ class Size2D;
  * \end{array} \right)
  * @f]
  */
-class NEIm2ColKernel : public INEKernel
+class CpuIm2ColKernel : public ICpuKernel
 {
 public:
-    const char *name() const override
-    {
-        return "NEIm2ColKernel";
-    }
     /** Default constructor */
-    NEIm2ColKernel();
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEIm2ColKernel(const NEIm2ColKernel &) = delete;
-    /** Prevent instances of this class from being copied (As this class contains pointers) */
-    NEIm2ColKernel &operator=(const NEIm2ColKernel &) = delete;
-    /** Allow instances of this class to be moved */
-    NEIm2ColKernel(NEIm2ColKernel &&) = default;
-    /** Allow instances of this class to be moved */
-    NEIm2ColKernel &operator=(NEIm2ColKernel &&) = default;
-    /** Default destructor */
-    ~NEIm2ColKernel() = default;
-
+    CpuIm2ColKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuIm2ColKernel);
     /** Set the input and output of the kernel.
      *
-     * @param[in]  input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
+     * @param[in]  src         The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM],
      *                         while every optional dimension from 4 and above represent a batch of inputs.
      *                         Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
      *                         Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
-     * @param[out] output      The output tensor. Data types supported: Same as @p input
+     * @param[out] dst         The output tensor info. Data types supported: Same as @p src
      * @param[in]  kernel_dims The kernel dimensions (width and height).
      * @param[in]  conv_info   Contains padding and stride information described in @ref PadStrideInfo.
      * @param[in]  has_bias    In case biases are provided expands the matrix with 1.
      * @param[in]  dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
      * @param[in]  num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
      */
-    void configure(const ITensor *input, ITensor *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+    void configure(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                    bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
-    /** Static function to check if given info will lead to a valid configuration of @ref NEIm2ColKernel
+    /** Static function to check if given info will lead to a valid configuration
      *
-     * @param[in] input       The input tensor to convert. 3 lower dimensions represent a single input [width, height, IFM],
-     *                        while every optional dimension from 4 and above represent a batch of inputs.
-     *                        Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32
-     *                        Note: QASYMM8/QASYMM8_SIGNED works only for has_bias = false
-     * @param[in] output      The output tensor. Data types supported: Same as @p input
-     * @param[in] kernel_dims The kernel dimensions (width and height).
-     * @param[in] conv_info   Contains padding and stride information described in @ref PadStrideInfo.
-     * @param[in] has_bias    In case biases are provided expands the matrix with 1.
-     * @param[in] dilation    (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
-     * @param[in] num_groups  (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is not supported
+     * Similar to CpuIm2ColKernel::configure()
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info,
                            bool has_bias, const Size2D &dilation = Size2D(1U, 1U), unsigned int num_groups = 1);
 
     // Inherited methods overridden:
-    void run(const Window &window, const ThreadInfo &info) override;
+    void run_op(ITensorPack &tensors, const Window &window, const ThreadInfo &info) override;
+    const char *name() const override;
 
 private:
     /** Template function to run im2col
      *
-     * @param[in] window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
+     * @param[in]  src    The input tensor
+     * @param[out] dst    The output tensor
+     * @param[in]  window Region on which to execute the kernel. (Must be a valid region of the window returned by window()).
      */
     template <typename T, bool has_pads, bool is_nchw>
-    void run_im2col(const Window &window);
+    void run_im2col(const ITensor *src, ITensor *dst, const Window &window);
 
     /** Common signature for all the specialised im2col functions
      *
      * @param[in] window Region on which to execute the kernel.
      */
-    using Im2ColFunctionPtr = void (NEIm2ColKernel::*)(const Window &window);
+    using Im2ColFunctionPtr = void (CpuIm2ColKernel::*)(const ITensor *src, ITensor *dst, const Window &window);
 
-    Im2ColFunctionPtr _func;
-    const ITensor    *_input;
-    ITensor          *_output;
-    std::pair<unsigned int, unsigned int> _convolved_dims;
-    PadStrideInfo _conv_info;
-    unsigned int  _kernel_width;
-    unsigned int  _kernel_height;
-    bool          _has_bias;
-    Size2D        _dilation;
-    DataLayout    _data_layout;
+    Im2ColFunctionPtr _func{ nullptr };
+    std::pair<unsigned int, unsigned int> _convolved_dims{};
+    PadStrideInfo _conv_info{};
+    unsigned int  _kernel_width{ 0 };
+    unsigned int  _kernel_height{ 0 };
+    bool          _has_bias{ false };
+    Size2D        _dilation{ 1U, 1U };
+    DataLayout    _data_layout{ DataLayout::UNKNOWN };
 };
+} // namespace kernels
+} // namespace cpu
 } // namespace arm_compute
-#endif /*ARM_COMPUTE_NEIM2COLKERNEL_H */
+#endif /*ARM_COMPUTE_CPU_IM2COL_KERNEL_H */
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index f40cbda779..f333364289 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -31,8 +31,8 @@
 #include "arm_compute/runtime/NEON/NEScheduler.h"
 
 #include "src/core/NEON/kernels/NECol2ImKernel.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
 #include "src/core/NEON/kernels/NEWeightsReshapeKernel.h"
+#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
 
 #include <set>
 #include <tuple>
@@ -99,7 +99,7 @@ NEGEMMConvolutionLayer::~NEGEMMConvolutionLayer() = default;
 
 NEGEMMConvolutionLayer::NEGEMMConvolutionLayer(const std::shared_ptr<IMemoryManager> &memory_manager, IWeightsManager *weights_manager)
     : _memory_group(memory_manager), _weights_manager(weights_manager), _reshape_weights(), _reshape_weights_managed(), _im2col_kernel(), _mm_gemm(memory_manager), _mm_gemmlowp(memory_manager),
-      _col2im_kernel(), _reshape_layer(), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(),
+      _col2im_kernel(), _reshape_layer(), _input(nullptr), _original_weights(nullptr), _original_output(nullptr), _im2col_output(), _weights_reshaped(), _gemm_output(), _gemm_output_3d(), _tmp_output(),
       _data_layout(DataLayout::NCHW), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _is_prepared(false)
 {
 }
@@ -269,6 +269,7 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
     const unsigned int kernel_width  = weights->info()->dimension(idx_width);
     const unsigned int kernel_height = weights->info()->dimension(idx_height);
 
+    _input            = input;
     _is_prepared      = weights_info.retain_internal_weights();
     _original_weights = weights;
     _original_output  = output;
@@ -332,8 +333,8 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
         _memory_group.manage(&_im2col_output);
 
         // Configure
-        _im2col_kernel = std::make_unique<NEIm2ColKernel>();
-        _im2col_kernel->configure(input, &_im2col_output, Size2D(kernel_width, kernel_height), conv_info, false, dilation);
+        _im2col_kernel = std::make_unique<cpu::kernels::CpuIm2ColKernel>();
+        _im2col_kernel->configure(input->info(), _im2col_output.info(), Size2D(kernel_width, kernel_height), conv_info, false, dilation);
 
         // Update GEMM input
         gemm_input_to_use = &_im2col_output;
@@ -521,7 +522,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
 
         im2col_reshaped_info = TensorInfo(shape_im2col, 1, data_type);
         im2col_reshaped_info.set_quantization_info(input->quantization_info());
-        ARM_COMPUTE_RETURN_ON_ERROR(NEIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
+        ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuIm2ColKernel::validate(input, &im2col_reshaped_info, Size2D(kernel_width, kernel_height), conv_info, append_bias, dilation));
         gemm_input_to_use = &im2col_reshaped_info;
     }
 
@@ -563,7 +564,12 @@ void NEGEMMConvolutionLayer::run()
     {
         // Run input reshaping
         unsigned int y_dim = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT);
-        NEScheduler::get().schedule(_im2col_kernel.get(), y_dim);
+        ITensorPack  pack =
+        {
+            { TensorType::ACL_SRC, _input },
+            { TensorType::ACL_DST, &_im2col_output }
+        };
+        NEScheduler::get().schedule_op(_im2col_kernel.get(), y_dim, _im2col_kernel->window(), pack);
     }
 
     // Handle the case where output has top/bottom padding
diff --git a/tests/validation/NEON/Im2Col.cpp b/tests/validation/NEON/Im2Col.cpp
index 156957a601..f338675346 100644
--- a/tests/validation/NEON/Im2Col.cpp
+++ b/tests/validation/NEON/Im2Col.cpp
@@ -22,7 +22,7 @@
  * SOFTWARE.
  */
 #include "arm_compute/core/Types.h"
-#include "src/core/NEON/kernels/NEIm2ColKernel.h"
+#include "src/core/cpu/kernels/CpuIm2ColKernel.h"
 #include "tests/NEON/Accessor.h"
 #include "tests/NEON/Helper.h"
 #include "tests/datasets/ShapeDatasets.h"
@@ -57,7 +57,7 @@ const auto conv_args_small         = combine(combine(combine(combine(conv_filter
 TEST_SUITE(NEON)
 TEST_SUITE(Im2Col)
 
-using NEIm2Col = NESynthetizeFunction<NEIm2ColKernel>;
+using CpuIm2Col = NESynthetizeFunctionWithZeroConstantKernelBorder<cpu::kernels::CpuIm2ColKernel>;
 
 // *INDENT-OFF*
 // clang-format off
@@ -78,26 +78,26 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(
                framework::dataset::make("Expected", { false, false, false, false, true })),
                input_info, output_info, has_bias, expected)
 {
-    bool status = bool(NEIm2Col::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
+    bool status = bool(cpu::kernels::CpuIm2ColKernel::validate(&input_info, &output_info, Size2D(3U, 3U), PadStrideInfo(), has_bias));
     ARM_COMPUTE_EXPECT(status == expected, framework::LogLevel::ERRORS);
 }
 // clang-format on
 // *INDENT-ON*
 
 template <typename T>
-using NEIm2ColFixture = Im2ColValidationFixture<Tensor, Accessor, NEIm2Col, T, false>;
+using CpuIm2ColFixture = Im2ColOpValidationFixture<Tensor, Accessor, CpuIm2Col, T, false>;
 
 TEST_SUITE(Float)
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F32)),
-                                                                                                    conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F32)),
+                                                                                                     conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                          DataType::F32)),
-                                                                                                  conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<float>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                           DataType::F32)),
+                                                                                                   conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -107,15 +107,15 @@ TEST_SUITE_END() // FP32
 #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC
 
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F16)),
-                                                                                                   conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::F16)),
+                                                                                                    conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
-                                                                                                         DataType::F16)),
-                                                                                                 conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<half>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()), framework::dataset::make("DataType",
+                                                                                                          DataType::F16)),
+                                                                                                  conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -127,15 +127,15 @@ TEST_SUITE_END() // FP16
 TEST_SUITE_END() // Float
 
 TEST_SUITE(QASYMM8)
-FIXTURE_DATA_TEST_CASE(RunSmall, NEIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                      conv_args_small))
+FIXTURE_DATA_TEST_CASE(RunSmall, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(im2col_shapes, framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                       conv_args_small))
 {
     // Validate output
     validate(Accessor(_target), _reference);
 }
-FIXTURE_DATA_TEST_CASE(RunLarge, NEIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()),
-                                                                                                            framework::dataset::make("DataType", DataType::QASYMM8)),
-                                                                                                    conv_args))
+FIXTURE_DATA_TEST_CASE(RunLarge, CpuIm2ColFixture<uint8_t>, framework::DatasetMode::NIGHTLY, combine(combine(concat(im2col_shapes, datasets::LargeShapes()),
+                                                                                                             framework::dataset::make("DataType", DataType::QASYMM8)),
+                                                                                                     conv_args))
 {
     // Validate output
     validate(Accessor(_target), _reference);
@@ -165,8 +165,8 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     Tensor dst_target = create_tensor<Tensor>(dst_shape, data_type, 1, qinfo);
 
     // Configure target function
-    NEIm2Col im2col_func;
-    im2col_func.configure(&src_target, &dst_target, spatial_kernel, conv_info, has_bias);
+    CpuIm2Col im2col_func;
+    im2col_func.configure(src_target.info(), dst_target.info(), spatial_kernel, conv_info, has_bias);
 
     // Extend padding
     src_target.info()->extend_padding(PaddingSize(3, 5, 9, 1));
@@ -185,8 +185,13 @@ TEST_CASE(PaddedChannelNHWC, framework::DatasetMode::PRECOMMIT)
     // Fill target source
     library->fill_tensor_uniform(Accessor(src_target), 0);
 
+    ITensorPack pack =
+    {
+        { TensorType::ACL_SRC, &src_target },
+        { TensorType::ACL_DST, &dst_target }
+    };
     // Run target function
-    im2col_func.run();
+    im2col_func.run(pack);
 
     // Calculate Reference
     SimpleTensor<float> src_ref{ src_shape, data_type, 1, qinfo, data_layout };
diff --git a/tests/validation/fixtures/Im2ColFixture.h b/tests/validation/fixtures/Im2ColFixture.h
index b1fbd76eb2..38970116f6 100644
--- a/tests/validation/fixtures/Im2ColFixture.h
+++ b/tests/validation/fixtures/Im2ColFixture.h
@@ -45,6 +45,97 @@ namespace validation
 using namespace arm_compute::misc::shape_calculator;
 
 template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool batch_size_on_z>
+class Im2ColOpValidationFixture : public framework::Fixture
+{
+public:
+    template <typename...>
+    void setup(TensorShape input_shape, DataType data_type, const Size2D &kernel_dims, const PadStrideInfo &conv_info, const QuantizationInfo &quant_info, const DataLayout &data_layout,
+               unsigned int num_groups)
+    {
+        _kernel_dims = kernel_dims;
+        _conv_info   = conv_info;
+        _quant_info  = quant_info;
+        _data_layout = data_layout;
+        _has_bias    = data_type != DataType::QASYMM8;
+        _num_groups  = num_groups;
+
+        if(_data_layout == DataLayout::NHWC)
+        {
+            permute(input_shape, PermutationVector(2U, 0U, 1U));
+        }
+
+        TensorInfo input_info(input_shape, 1, data_type);
+        input_info.set_data_layout(_data_layout);
+
+        const TensorShape output_shape = compute_im2col_conv_shape(&input_info, _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), batch_size_on_z && _num_groups == 1, _num_groups);
+        _target                        = compute_target(input_shape, output_shape, data_type);
+
+        compute_reference(input_shape, output_shape, data_type);
+    }
+
+protected:
+    template <typename U>
+    void fill(U &&tensor)
+    {
+        library->fill_tensor_uniform(tensor, 0);
+    }
+
+    TensorType compute_target(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Create tensors
+        TensorType src = create_tensor<TensorType>(input_shape, data_type, 1, _quant_info, _data_layout);
+        TensorType dst = create_tensor<TensorType>(output_shape, data_type, 1, _quant_info);
+
+        // Create and configure function
+        FunctionType im2col_func;
+        im2col_func.configure(src.info(), dst.info(), _kernel_dims, _conv_info, _has_bias, Size2D(1U, 1U), _num_groups);
+
+        ARM_COMPUTE_ASSERT(src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(dst.info()->is_resizable());
+
+        // Allocate tensors
+        src.allocator()->allocate();
+        dst.allocator()->allocate();
+
+        ARM_COMPUTE_ASSERT(!src.info()->is_resizable());
+        ARM_COMPUTE_ASSERT(!dst.info()->is_resizable());
+
+        // Fill tensors
+        fill(AccessorType(src));
+
+        arm_compute::ITensorPack pack =
+        {
+            { arm_compute::TensorType::ACL_SRC, &src },
+            { arm_compute::TensorType::ACL_DST, &dst }
+        };
+        // Compute function
+        im2col_func.run(pack);
+
+        return dst;
+    }
+
+    void compute_reference(const TensorShape &input_shape, const TensorShape &output_shape, DataType data_type)
+    {
+        // Create reference
+        SimpleTensor<T> src{ input_shape, data_type, 1, _quant_info, _data_layout };
+        _reference = SimpleTensor<T>(output_shape, data_type, 1, _quant_info, DataLayout::NCHW);
+
+        // Fill reference
+        fill(src);
+
+        reference::im2col<T>(src, _reference, _kernel_dims, _conv_info, _has_bias, _num_groups);
+    }
+    TensorType       _target{};
+    SimpleTensor<T>  _reference{};
+    Size2D           _kernel_dims{};
+    PadStrideInfo    _conv_info{};
+    DataLayout       _data_layout{};
+    QuantizationInfo _quant_info{};
+    bool             _has_bias{};
+    unsigned int     _num_groups{};
+};
+
+template <typename TensorType, typename AccessorType, typename FunctionType, typename T, bool batch_size_on_z>
 class Im2ColValidationFixture : public framework::Fixture
 {
 public:
author	Manuel Bottini <manuel.bottini@arm.com>	2021-06-30 18:29:18 +0100
committer	Manuel Bottini <manuel.bottini@arm.com>	2021-07-06 11:03:31 +0000
commit	900289936c458eff95499e0a0eaba989a27aaa4d (patch)
tree	305853a38fd66842d19aa1a2d1cad88a70b946bc
parent	6132c7aeaf6230a4e8b074309327762a9e4be003 (diff)
download	ComputeLibrary-900289936c458eff95499e0a0eaba989a27aaa4d.tar.gz