author     Sang-Hoon Park <sang-hoon.park@arm.com>  2021-05-17 17:04:50 +0100
committer  Sang-Hoon Park <sang-hoon.park@arm.com>  2021-05-26 10:16:05 +0000
commit     d89e2faa60d148f3c04e57032a28f1065a1be0e8 (patch)
tree       c95eb97f9c79198cb5db1232b497491df10614f2
parent     8b83d4684249bb96e27f95e11cf8f38a1c33b82b (diff)
download   ComputeLibrary-d89e2faa60d148f3c04e57032a28f1065a1be0e8.tar.gz
Create CpuGemmDirectConv2d
As the first phase of making NEGEMMConv2d stateless, the CpuGemmDirectConv2d operator is created. The kernels and operators used by the new operator take TensorInfo pointers instead of Tensor pointers.

CpuGemmDirectConv2d is not completely stateless yet, because it manages one intermediate tensor internally; this will be resolved by implementing a memory injection mechanism in follow-up patches. In addition, the weights manager of CpuGemmAssemblyDispatch is disabled to enable this work.

Implements: COMPMID-4506

Change-Id: Iec3ca6de29d98bef7ea95e8f4473d6dc0024a140
Signed-off-by: Sang-Hoon Park <sang-hoon.park@arm.com>
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5672
Tested-by: Arm Jenkins <bsgcomp@arm.com>
Reviewed-by: Michele Di Giorgio <michele.digiorgio@arm.com>
Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com>
Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--  Android.bp                                                            1
-rw-r--r--  SConscript                                                            1
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMM.h                           3
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMConv2d.h                    13
-rw-r--r--  arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h     3
-rw-r--r--  src/runtime/NEON/functions/NEGEMM.cpp                                17
-rw-r--r--  src/runtime/NEON/functions/NEGEMMConv2d.cpp                         139
-rw-r--r--  src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp          15
-rw-r--r--  src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp                   206
-rw-r--r--  src/runtime/cpu/operators/CpuGemmDirectConv2d.h                     112
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp      190
-rw-r--r--  src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h         35
12 files changed, 484 insertions, 251 deletions
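To make the new calling contract concrete, the following is a minimal usage sketch; it is not part of this patch, and the wrapper name run_gemm_conv2d and its null-bias handling are illustrative assumptions. It mirrors what the reworked NEGEMMConv2d below does: the operator is configured from ITensorInfo descriptors only, and the concrete ITensor objects are bound at run time through an ITensorPack.

    // Illustrative sketch only, not part of this patch: driving the new stateless
    // operator the way the reworked NEGEMMConv2d does. The wrapper name
    // run_gemm_conv2d and the null-bias handling are assumptions for the example.
    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/ITensorPack.h"
    #include "arm_compute/runtime/FunctionDescriptors.h"
    #include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"

    using namespace arm_compute;

    void run_gemm_conv2d(ITensor *src, const ITensor *weights, const ITensor *biases,
                         ITensor *dst, const Conv2dInfo &conv_info)
    {
        // Configuration only sees tensor descriptors (ITensorInfo), not tensor payloads.
        cpu::CpuGemmDirectConv2d conv{};
        conv.configure(src->info(), weights->info(),
                       biases != nullptr ? biases->info() : nullptr,
                       dst->info(), conv_info);

        // The concrete tensors are bound at run time through an ITensorPack.
        ITensorPack pack{};
        pack.add_const_tensor(TensorType::ACL_SRC_0, src);
        pack.add_const_tensor(TensorType::ACL_SRC_1, weights);
        pack.add_const_tensor(TensorType::ACL_SRC_2, biases);
        pack.add_tensor(TensorType::ACL_DST, dst);

        // First run triggers prepare(), which permutes the weights into the
        // operator's internal tensor (the remaining piece of state).
        conv.run(pack);
    }

The remaining statefulness mentioned in the commit message is this internally managed permuted-weights tensor, which the planned memory injection mechanism is meant to remove.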
diff --git a/Android.bp b/Android.bp
index c4438f9e76..13b70ea0b3 100644
--- a/Android.bp
+++ b/Android.bp
@@ -644,6 +644,7 @@ cc_library_static {
"src/runtime/cpu/operators/CpuFill.cpp",
"src/runtime/cpu/operators/CpuFlatten.cpp",
"src/runtime/cpu/operators/CpuFloor.cpp",
+ "src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp",
"src/runtime/cpu/operators/CpuMul.cpp",
"src/runtime/cpu/operators/CpuPermute.cpp",
"src/runtime/cpu/operators/CpuPooling.cpp",
diff --git a/SConscript b/SConscript
index e5f7a8a938..ac15c7fcfb 100644
--- a/SConscript
+++ b/SConscript
@@ -375,6 +375,7 @@ if env['neon']:
'src/runtime/cpu/operators/CpuSoftmax.cpp',
'src/runtime/cpu/operators/CpuSub.cpp',
'src/runtime/cpu/operators/CpuTranspose.cpp',
+ 'src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp',
]
cpu_internal_operator_files = ['src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp',]
runtime_files += [ cpu_rt_files, cpu_operator_hp_files, cpu_operator_files, cpu_internal_operator_files ]
diff --git a/arm_compute/runtime/NEON/functions/NEGEMM.h b/arm_compute/runtime/NEON/functions/NEGEMM.h
index 9df2e08956..6fa30bd545 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMM.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMM.h
@@ -24,6 +24,7 @@
#ifndef ARM_COMPUTE_NEGEMM_H
#define ARM_COMPUTE_NEGEMM_H
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
@@ -144,6 +145,8 @@ private:
bool _run_activation;
bool _reshape_b_only_on_first_run;
bool _is_prepared;
+
+ ITensorPack _asm_glue_tensors{};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMM_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
index 6c71f0e188..f39ce4dfa3 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMConv2d.h
@@ -27,15 +27,13 @@
#include "arm_compute/runtime/FunctionDescriptors.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
-#include "arm_compute/runtime/NEON/functions/NEActivationLayer.h"
-#include "arm_compute/runtime/NEON/functions/NEPermute.h"
-#include "arm_compute/runtime/Tensor.h"
#include <memory>
namespace arm_compute
{
// Forward declarations
class ITensor;
+class ITensorInfo;
namespace cpu
{
class CpuGemmAssemblyDispatch;
@@ -114,13 +112,8 @@ public:
void prepare() override;
private:
- std::unique_ptr<cpu::CpuGemmAssemblyDispatch> _gemm_asm_func;
- NEActivationLayer _activation_func;
- NEPermute _weights_permute_func;
- const ITensor *_original_weights;
- Tensor _permuted_weights;
- bool _is_prepared;
- bool _run_activation;
+ struct Impl;
+ std::unique_ptr<Impl> _impl;
};
} // namespace arm_compute
#endif /* ARM_COMPUTE_NEGEMMCONV2D_H */
diff --git a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
index a292712bd7..dc9783f9eb 100644
--- a/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
+++ b/arm_compute/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.h
@@ -25,6 +25,7 @@
#define ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H
#include "NEActivationLayer.h"
+#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
@@ -169,6 +170,8 @@ private:
bool _fuse_output_stage;
bool _run_activation;
bool _flip_signedness;
+
+ ITensorPack _asm_glue_tensors{};
};
} // namespace arm_compute
#endif /*ARM_COMPUTE_NEGEMMLOWPMATRIXMULTIPLYCORE_H */
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index b84128e6c0..7318c3e492 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -89,10 +89,19 @@ void NEGEMM::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITe
if(run_optimised)
{
- const ITensor *c_to_use = is_c_bias ? c : nullptr;
- _asm_glue->configure(a, b, c_to_use, d, asm_info);
+ const ITensor *c_to_use = is_c_bias ? c : nullptr;
+ const ITensorInfo *c_info_to_use = c_to_use != nullptr ? c_to_use->info() : nullptr;
+ _asm_glue->configure(a->info(), b->info(), c_info_to_use, d->info(), asm_info);
ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
+ _asm_glue_tensors =
+ {
+ { ACL_SRC_0, a },
+ { ACL_SRC_1, b },
+ { ACL_SRC_2, c_to_use },
+ { ACL_DST, d },
+ };
+
// Scale product by alpha
if(_run_alpha_scale)
{
@@ -314,7 +323,7 @@ void NEGEMM::run()
if(_asm_glue->is_configured())
{
- _asm_glue->run();
+ _asm_glue->run(_asm_glue_tensors);
if(_run_alpha_scale)
{
_alpha_scale_func.run();
@@ -368,7 +377,7 @@ void NEGEMM::prepare()
ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
}
- _asm_glue->prepare();
+ _asm_glue->prepare(_asm_glue_tensors);
if(!original_b_managed_by_weights_manager)
{
_original_b->mark_as_unused();
diff --git a/src/runtime/NEON/functions/NEGEMMConv2d.cpp b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
index ddeacc85f5..94ceb6d27c 100644
--- a/src/runtime/NEON/functions/NEGEMMConv2d.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConv2d.cpp
@@ -26,151 +26,48 @@
#include "arm_compute/core/utils/misc/ShapeCalculator.h"
#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
#include "arm_compute/runtime/NEON/NEScheduler.h"
-#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
#include <set>
namespace arm_compute
{
-namespace
-{
-GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *output, const ActivationLayerInfo &act)
-{
- // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
- // Extract and negate input and weights offset
- const QuantizationInfo iqinfo = input->quantization_info();
- const QuantizationInfo wqinfo = weights->quantization_info();
- const QuantizationInfo oqinfo = (output->total_size() == 0) ? iqinfo : output->quantization_info();
- const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
- const DataType data_type = input->data_type();
- // Merge activation with output stage
- const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
- ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
- ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
- };
- PixelValue type_min{};
- PixelValue type_max{};
- std::tie(type_min, type_max) = get_min_max(data_type);
- int32_t min_activation = type_min.get<int32_t>();
- int32_t max_activation = type_max.get<int32_t>();
- if(supported_acts.count(act.activation()) != 0)
- {
- std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
- }
- GEMMLowpOutputStageInfo os_info;
- os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
- os_info.gemmlowp_offset = uoqinfo.offset;
- os_info.gemmlowp_min_bound = min_activation;
- os_info.gemmlowp_max_bound = max_activation;
- os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
- quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
- return os_info;
-}
-cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
+using OperatorType = cpu::CpuGemmDirectConv2d;
+
+struct NEGEMMConv2d::Impl
{
- cpu::AsmGemmInfo asm_info;
- asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
- asm_info.ps_info = info.conv_info;
- asm_info.activation_info = info.act_info;
- asm_info.depth_output_gemm3d = true;
- asm_info.reinterpret_input_as_3d = true;
- asm_info.padding_top = info.conv_info.pad_top();
- asm_info.padding_left = info.conv_info.pad_left();
- asm_info.padding_value = 0.f;
- asm_info.negated_offsets = false;
- return asm_info;
-}
-} // namespace
+ ITensorPack tensors{};
+ std::unique_ptr<OperatorType> op{ nullptr };
+};
NEGEMMConv2d::NEGEMMConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
- : _gemm_asm_func(std::make_unique<cpu::CpuGemmAssemblyDispatch>(memory_manager)), _activation_func(), _weights_permute_func(), _original_weights(nullptr), _permuted_weights(), _is_prepared(false),
- _run_activation(false)
+ : _impl(std::make_unique<Impl>())
{
+ _impl->op = std::make_unique<OperatorType>(memory_manager);
}
NEGEMMConv2d::~NEGEMMConv2d() = default;
void NEGEMMConv2d::configure(ITensor *input, const ITensor *weights, const ITensor *biases, ITensor *output, const Conv2dInfo &info)
{
- ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_ERROR_THROW_ON(NEGEMMConv2d::validate(input->info(),
- weights->info(),
- biases != nullptr ? biases->info() : nullptr,
- output->info(),
- info));
- _original_weights = weights;
- _weights_permute_func.configure(weights, &_permuted_weights, PermutationVector{ 3, 0, 1, 2 });
+ _impl->tensors.add_const_tensor(TensorType::ACL_SRC_0, input);
+ _impl->tensors.add_const_tensor(TensorType::ACL_SRC_1, weights);
+ _impl->tensors.add_const_tensor(TensorType::ACL_SRC_2, biases);
+ _impl->tensors.add_tensor(TensorType::ACL_DST, output);
- // Configure assembly dispatch
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- if(is_data_type_quantized(input->info()->data_type()))
- {
- asm_info.output_stage = calculate_output_stage_metadata(input->info(), weights->info(), output->info(), info.act_info);
- }
- _gemm_asm_func->configure(input, &_permuted_weights, biases, output, asm_info);
-
- // Configure activation
- if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info))
- {
- _activation_func.configure(output, nullptr, info.act_info);
- _run_activation = true;
- }
+ _impl->op->configure(input->info(), weights->info(), biases->info(), output->info(), info);
}
+
Status NEGEMMConv2d::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &info)
{
- ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, weights, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(input, weights);
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
- const DataType data_type = input->data_type();
- const TensorShape i_shape = input->tensor_shape();
- const TensorShape w_shape = weights->tensor_shape();
- ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
- ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
- ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
- // Validate biases
- if(biases != nullptr)
- {
- if(is_data_type_quantized_asymmetric(data_type))
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
- }
- else if(data_type == DataType::BFLOAT16)
- {
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
- }
- else
- {
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases);
- }
- ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
- ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
- }
-
- cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(input, weights, biases, output, asm_info));
- return Status{};
+ return OperatorType::validate(input, weights, biases, output, info);
}
void NEGEMMConv2d::run()
{
- prepare();
-
- _gemm_asm_func->run();
- if(_run_activation)
- {
- _activation_func.run();
- }
+ _impl->op->run(_impl->tensors);
}
void NEGEMMConv2d::prepare()
{
- if(!_is_prepared)
- {
- _permuted_weights.allocator()->allocate();
- _weights_permute_func.run();
- _original_weights->mark_as_unused();
- _is_prepared = true;
- }
+ _impl->op->prepare(_impl->tensors);
}
} // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 53dd39e549..cc0f20e695 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -146,14 +146,21 @@ void NEGEMMLowpMatrixMultiplyCore::configure(const ITensor *a, const ITensor *b,
{
if(is_data_type_quantized_asymmetric(a_to_use->info()->data_type()) && info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT)
{
- _asm_glue->configure(a_to_use, b, c, output, asm_info);
+ auto c_info_to_use = c == nullptr ? nullptr : c->info();
+ _asm_glue->configure(a_to_use->info(), b->info(), c_info_to_use, output->info(), asm_info);
_fused_assembly_path = _asm_glue->is_configured();
+ _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_2, c);
+ _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output);
}
else
{
- _asm_glue->configure(a_to_use, b, nullptr, _fuse_output_stage ? &_mm_result_s32 : output, asm_info);
+ auto output_to_use = (_fuse_output_stage ? &_mm_result_s32 : output);
+ _asm_glue->configure(a_to_use->info(), b->info(), nullptr, output_to_use->info(), asm_info);
+ _asm_glue_tensors.add_tensor(TensorType::ACL_DST, output_to_use);
}
_assembly_path = _asm_glue->is_configured();
+ _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_0, a_to_use);
+ _asm_glue_tensors.add_const_tensor(TensorType::ACL_SRC_1, b);
break;
}
default:
@@ -513,7 +520,7 @@ void NEGEMMLowpMatrixMultiplyCore::run()
// Run GEMM
if(_asm_glue->is_configured())
{
- _asm_glue->run();
+ _asm_glue->run(_asm_glue_tensors);
}
else
{
@@ -583,7 +590,7 @@ void NEGEMMLowpMatrixMultiplyCore::prepare()
ARM_COMPUTE_ERROR_ON(!_original_b->is_used());
}
- _asm_glue->prepare();
+ _asm_glue->prepare(_asm_glue_tensors);
if(!original_b_managed_by_weights_manager)
{
_original_b->mark_as_unused();
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
new file mode 100644
index 0000000000..b47a08a5e9
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -0,0 +1,206 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/runtime/cpu/operators/CpuGemmDirectConv2d.h"
+
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "arm_compute/runtime/NEON/NEScheduler.h"
+#include "src/runtime/cpu/operators/CpuActivation.h"
+#include "src/runtime/cpu/operators/CpuPermute.h"
+#include "src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
+
+#include <set>
+
+namespace arm_compute
+{
+namespace cpu
+{
+namespace
+{
+GEMMLowpOutputStageInfo calculate_output_stage_metadata(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const ActivationLayerInfo &act)
+{
+ // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+ // Extract and negate input and weights offset
+ const QuantizationInfo iqinfo = src->quantization_info();
+ const QuantizationInfo wqinfo = weights->quantization_info();
+ const QuantizationInfo oqinfo = (dst->total_size() == 0) ? iqinfo : dst->quantization_info();
+ const UniformQuantizationInfo uoqinfo = oqinfo.uniform();
+ const DataType data_type = src->data_type();
+ // Merge activation with output stage
+ const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+ ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
+ ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
+ };
+ PixelValue type_min{};
+ PixelValue type_max{};
+ std::tie(type_min, type_max) = get_min_max(data_type);
+ int32_t min_activation = type_min.get<int32_t>();
+ int32_t max_activation = type_max.get<int32_t>();
+ if(supported_acts.count(act.activation()) != 0)
+ {
+ std::tie(min_activation, max_activation) = get_quantized_activation_min_max(act, data_type, uoqinfo);
+ }
+ GEMMLowpOutputStageInfo os_info;
+ os_info.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+ os_info.gemmlowp_offset = uoqinfo.offset;
+ os_info.gemmlowp_min_bound = min_activation;
+ os_info.gemmlowp_max_bound = max_activation;
+ os_info.is_quantized_per_channel = (weights->data_type() == DataType::QSYMM8_PER_CHANNEL);
+ quantization::calculate_quantized_multipliers(iqinfo, wqinfo, oqinfo, os_info);
+ return os_info;
+}
+cpu::AsmGemmInfo init_assembly_metadata(const Conv2dInfo &info, bool is_indirect)
+{
+ cpu::AsmGemmInfo asm_info;
+ asm_info.method = is_indirect ? cpu::AsmConvMethod::Indirect : cpu::AsmConvMethod::Conv;
+ asm_info.ps_info = info.conv_info;
+ asm_info.activation_info = info.act_info;
+ asm_info.depth_output_gemm3d = true;
+ asm_info.reinterpret_input_as_3d = true;
+ asm_info.padding_top = info.conv_info.pad_top();
+ asm_info.padding_left = info.conv_info.pad_left();
+ asm_info.padding_value = 0.f;
+ asm_info.negated_offsets = false;
+ return asm_info;
+}
+} // namespace
+
+CpuGemmDirectConv2d::CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager)
+ : _gemm_asm_func(std::make_unique<CpuGemmAssemblyDispatch>(memory_manager)),
+ _activation_func(std::make_unique<CpuActivation>()),
+ _weights_permute_func(std::make_unique<CpuPermute>()),
+ _permuted_weights_info(),
+ _permuted_weights(std::make_unique<Tensor>())
+{
+}
+
+CpuGemmDirectConv2d::~CpuGemmDirectConv2d() = default;
+
+void CpuGemmDirectConv2d::configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(CpuGemmDirectConv2d::validate(src,
+ weights,
+ biases != nullptr ? biases : nullptr,
+ dst,
+ info));
+ _original_weights_info = weights;
+ _weights_permute_func->configure(weights, &_permuted_weights_info, PermutationVector{ 3, 0, 1, 2 });
+
+ // Configure assembly dispatch
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ if(is_data_type_quantized(src->data_type()))
+ {
+ asm_info.output_stage = calculate_output_stage_metadata(src, weights, dst, info.act_info);
+ }
+ _gemm_asm_func->configure(src, &_permuted_weights_info, biases, dst, asm_info);
+
+ // Configure activation
+ if(info.act_info.enabled() && !_gemm_asm_func->is_activation_supported(info.act_info))
+ {
+ _activation_func->configure(dst, nullptr, info.act_info);
+ _run_activation = true;
+ }
+}
+Status CpuGemmDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(weights, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::BFLOAT16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(info.num_groups > 1, "Grouping (num_groups != 1) is not supported on Neon");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NHWC, "Data layout supported is NHWC");
+ const DataType data_type = src->data_type();
+ const TensorShape i_shape = src->tensor_shape();
+ const TensorShape w_shape = weights->tensor_shape();
+ ARM_COMPUTE_RETURN_ERROR_ON(w_shape[0] != i_shape[0]);
+ ARM_COMPUTE_RETURN_ERROR_ON(info.dilation != Size2D(1U, 1U));
+ ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+ // Validate biases
+ if(biases != nullptr)
+ {
+ if(is_data_type_quantized_asymmetric(data_type))
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+ }
+ else if(data_type == DataType::BFLOAT16)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::F32);
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+ }
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(3));
+ ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+ }
+
+ cpu::AsmGemmInfo asm_info = init_assembly_metadata(info, false);
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::CpuGemmAssemblyDispatch::validate(src, weights, biases, dst, asm_info));
+ return Status{};
+}
+void CpuGemmDirectConv2d::run(ITensorPack &tensors)
+{
+ prepare(tensors);
+
+ _gemm_asm_func->run(tensors);
+ if(_run_activation)
+ {
+ _activation_func->run(tensors);
+ }
+}
+
+void CpuGemmDirectConv2d::allocate_permuted_weights()
+{
+ // TODO: This function will be removed when memory injection is implemented.
+ ARM_COMPUTE_ERROR_ON(_permuted_weights == nullptr);
+ _permuted_weights->allocator()->free();
+ _permuted_weights->allocator()->init(_permuted_weights_info);
+ _permuted_weights->allocator()->allocate();
+}
+
+void CpuGemmDirectConv2d::prepare(ITensorPack &tensors)
+{
+ if(!_is_prepared)
+ {
+ allocate_permuted_weights();
+ ITensorPack permute_tensors
+ {
+ { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_1) },
+ { TensorType::ACL_DST, _permuted_weights.get() },
+ };
+
+ _weights_permute_func->run(permute_tensors);
+
+ tensors.get_const_tensor(TensorType::ACL_SRC_1)->mark_as_unused();
+
+ // switch the original tensor with permuted tensor
+ tensors.add_const_tensor(TensorType::ACL_SRC_1, _permuted_weights.get());
+ _is_prepared = true;
+ }
+}
+
+} // namespace cpu
+} // namespace arm_compute
\ No newline at end of file
diff --git a/src/runtime/cpu/operators/CpuGemmDirectConv2d.h b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
new file mode 100644
index 0000000000..6aa17c2349
--- /dev/null
+++ b/src/runtime/cpu/operators/CpuGemmDirectConv2d.h
@@ -0,0 +1,112 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
+#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
+
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/runtime/Tensor.h"
+#include "src/core/common/Macros.h"
+#include "src/core/cpu/ICpuKernel.h"
+#include "src/runtime/cpu/ICpuOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+// Forward declarations
+class ITensor;
+struct Conv2dInfo;
+namespace cpu
+{
+class CpuGemmAssemblyDispatch;
+class CpuActivation;
+class CpuPermute;
+
+class CpuGemmDirectConv2d : public ICpuOperator
+{
+public:
+ /** Constructor */
+ CpuGemmDirectConv2d(const std::shared_ptr<IMemoryManager> &memory_manager = nullptr);
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmDirectConv2d);
+ /** Destructor */
+ ~CpuGemmDirectConv2d();
+ /** Set the input and output tensors.
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src0 |src1 |src2 |dst |
+ * |:--------------|:--------------|:--------------|:--------------|
+ * |QASYMM8 |QASYMM8 |S32 |QASYMM8 |
+ * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED |
+ * |F16 |F16 |F16 |F16 |
+ * |F32 |F32 |F32 |F32 |
+ * |BFLOAT16 |BFLOAT16 |BFLOAT16 |BFLOAT16 |
+ *
+ * @param[in] src Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+ * while every optional dimension from 4 and above represent a batch of inputs.
+ * Data types supported: QASYMM8/QASYMM8_SIGNED/BFLOAT16/F16/F32.
+ * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+ * Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/BFLOAT16/F16/F32.
+ * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p input data type, except for input of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[in] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+ * Data types supported: Same as @p input.
+ * @param[in] info Contains padding and stride information described in @ref PadStrideInfo.
+ */
+ void configure(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &info);
+ /** Static function to check if given info will lead to a valid configuration of @ref CpuGemmDirectConv2d
+ *
+ * Similar to CpuGemmDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &info);
+
+ // Inherited methods overridden:
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
+
+private:
+ std::unique_ptr<CpuGemmAssemblyDispatch> _gemm_asm_func;
+ std::unique_ptr<CpuActivation> _activation_func;
+ std::unique_ptr<CpuPermute> _weights_permute_func;
+ const ITensorInfo *_original_weights_info{};
+ TensorInfo _permuted_weights_info;
+ std::unique_ptr<Tensor> _permuted_weights{ nullptr };
+ bool _is_prepared{ false };
+ bool _run_activation{ false };
+
+ /** Function to allocate a tensor for permuted weights
+ *
+ * @note This function will be removed when memory injection is properly implemented.
+ */
+ void allocate_permuted_weights();
+};
+} // namespace cpu
+} // namespace arm_compute
+
+#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 36c1bbb1b3..0c511ff548 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -56,14 +56,13 @@ struct Params
bool indirect;
};
-Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d, const AsmGemmInfo &info)
+Params extract_parameters(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
-
Params p;
- p.M = d->info()->tensor_shape().y();
- p.K = a->info()->tensor_shape().x();
- p.N = d->info()->tensor_shape().x();
+ p.M = d->tensor_shape().y();
+ p.K = a->tensor_shape().x();
+ p.N = d->tensor_shape().x();
p.batches = 1;
p.multis = 1;
p.sections = 1;
@@ -72,19 +71,19 @@ Params extract_parameters(const ITensor *a, const ITensor *b, const ITensor *d,
if(info.method == AsmConvMethod::Conv || info.method == AsmConvMethod::Indirect)
{
p.indirect = true;
- p.sections = b->info()->tensor_shape()[2] * b->info()->tensor_shape()[3];
+ p.sections = b->tensor_shape()[2] * b->tensor_shape()[3];
}
else
{
- p.multis = b->info()->tensor_shape().z();
- p.batches = d->info()->tensor_shape().total_size_upper(2) / p.multis;
+ p.multis = b->tensor_shape().z();
+ p.batches = d->tensor_shape().total_size_upper(2) / p.multis;
}
// Update M in case of GEMM3D for output
if(info.depth_output_gemm3d != 0)
{
- p.M = d->info()->tensor_shape().y() * d->info()->tensor_shape().z();
- p.batches = d->info()->tensor_shape().total_size_upper(3) / p.multis;
+ p.M = d->tensor_shape().y() * d->tensor_shape().z();
+ p.batches = d->tensor_shape().total_size_upper(3) / p.multis;
}
return p;
@@ -205,11 +204,11 @@ public:
}
private:
- Tensor _output{};
- int _ldb{};
- const TypeInput *_in1_ptr{};
- int _multi_stride_b{};
- size_t _B_pretranspose_size{};
+ Tensor _output{};
+ int _ldb{};
+ const TypeInput *_in1_ptr{};
+ int _multi_stride_b{};
+ size_t _B_pretranspose_size{};
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
};
@@ -221,8 +220,7 @@ public:
/** Destructor */
~Fallback()
{
- // Release memory if we have allocated the memory ourselves
- if(_pretranspose && !(_weights_manager && _weights_manager->are_weights_managed(_b)))
+ if(_pretranspose && !(is_weight_managed()))
{
delete _pretranspose;
}
@@ -240,7 +238,7 @@ public:
* @param[in] weights_manager Weights manager to be used by the function.
* @param[in] os Output stage meta-data.
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os = {});
@@ -262,8 +260,8 @@ public:
const std::vector<int32_t> &multipliers);
// Inherited methods overridden:
- void run() override;
- void prepare() override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &tensors) override;
bool is_configured() const override;
private:
@@ -283,28 +281,12 @@ private:
*/
void configure_indirect(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *d, const AsmGemmInfo &info);
/** Prepare the indirect buffer */
- void prepare_indirect_buffer();
+ void prepare_indirect_buffer(ITensorPack &tensors);
/** Assembly Gemm kernel */
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{ nullptr };
/** Optimised Arm® Neon™ kernel */
std::unique_ptr<INEKernel> _optimised_kernel{ nullptr };
- /** Input A */
- const ITensor *_a
- {
- nullptr
- };
- /** Input B */
- const ITensor *_b
- {
- nullptr
- };
- const ITensor *_c
- {
- nullptr
- };
- /** Output */
- ITensor *_d{ nullptr };
/** GEMM workspace */
Tensor _workspace{};
/** Pre-transpose tensor */
@@ -328,8 +310,27 @@ private:
/** Indirect buffer */
std::unique_ptr<const TypeInput *const *, free_delete> _indirect_arg{};
std::unique_ptr<const TypeInput *, free_delete> _indirect_buf{};
- std::vector<TypeInput> _indirect_pad{};
- arm_gemm::ConvolutionParameters _cp{};
+ std::vector<TypeInput> _indirect_pad{};
+ arm_gemm::ConvolutionParameters _cp{};
+
+ bool is_weight_managed()
+ {
+ // TODO (COMPMID-4539): This function should do the following:
+ // _weights_manager && _weights_manager->are_weights_managed(_b)
+ // , where _b is the second tensor that used to be passed to configure().
+ // Currently, however, the weights manager is disabled to make this class stateless.
+ // This should be revisited in the future.
+ return false;
+ }
+
+ void acquire_managed_weight()
+ {
+ // TODO (COMPMID-4539): This function should do the following:
+ // _pretranspose = _weights_manager->acquire(_b, &_weights_transform);
+ // , where _b is the second tensor that used to be passed to configure().
+ // Currently, however, the weights manager is disabled to make this class stateless.
+ _pretranspose = nullptr;
+ }
};
template <typename TypeInput, typename TypeOutput, class OutputStage>
@@ -352,14 +353,15 @@ Fallback<TypeInput, TypeOutput, OutputStage>::set_requantize_data(const std::vec
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer()
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare_indirect_buffer(ITensorPack &tensors)
{
- const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(_a->buffer());
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ const TypeInput *A_ptr = reinterpret_cast<TypeInput *>(a->buffer());
const int multis = 1;
- const int batches = _a->info()->tensor_shape().total_size_upper(3);
- const size_t stride_A = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const size_t batch_stride_A = _a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
- const size_t multi_stride_A = _a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
+ const int batches = a->info()->tensor_shape().total_size_upper(3);
+ const size_t stride_A = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const size_t batch_stride_A = a->info()->strides_in_bytes()[3] / sizeof(TypeInput);
+ const size_t multi_stride_A = a->info()->strides_in_bytes()[4] / sizeof(TypeInput);
const size_t output_hw = _cp.output_height * _cp.output_width;
const int batch_size = _cp.kernel_height * _cp.kernel_width * output_hw * sizeof(TypeInput);
@@ -466,10 +468,11 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure_indirect(const ITen
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d,
+void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d,
arm_gemm::GemmArgs args, const AsmGemmInfo &gemm_info,
MemoryGroup &memory_group, IWeightsManager *weights_manager, const OutputStage &os)
{
+ ARM_COMPUTE_UNUSED(c);
arm_gemm::GemmConfig gemm_cfg;
_kernel_info = arm_gemm::get_gemm_method<TypeInput, TypeOutput, OutputStage>(args, os);
_weights_manager = weights_manager;
@@ -508,10 +511,6 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
}
_optimised_kernel = std::move(acl_gemm_wrapper);
- _a = a;
- _b = b;
- _c = c;
- _d = d;
_gemm_info = gemm_info;
// Check for pre-transposed support
if(_gemm_kernel_asm->B_pretranspose_required())
@@ -519,10 +518,10 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
- if(weights_manager && _weights_manager->are_weights_managed(b))
+ if(is_weight_managed())
{
_weights_transform.configure(B_pretranspose_size, alignment);
- _pretranspose = _weights_manager->acquire(b, &_weights_transform);
+ acquire_managed_weight();
}
else
{
@@ -534,32 +533,34 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
// Handle indirect GEMM convolution
if(gemm_info.method == AsmConvMethod::Conv || gemm_info.method == AsmConvMethod::Indirect)
{
- configure_indirect(a->info(), b->info(), d->info(), gemm_info);
+ configure_indirect(a, b, d, gemm_info);
}
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
+void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
{
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
if(!_is_prepared)
{
// Set up matrix bias in the assembly kernel; it's just a pointer to matrix C.
- if(_c && _c->info()->data_type() == DataType::S32)
+ if(c && c->info()->data_type() == DataType::S32)
{
- _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(_c->buffer() + _c->info()->offset_first_element_in_bytes()), 0);
+ _gemm_kernel_asm->set_quantized_bias(reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
// Pretranspose B if required
if(_gemm_kernel_asm->B_pretranspose_required())
{
- const int ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- const auto in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+ const int ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- if(_weights_manager && _weights_manager->are_weights_managed(_b))
+ if(is_weight_managed())
{
_weights_transform.set_args(ldb, in1_ptr, multi_stride_b, _gemm_kernel_asm);
- _weights_manager->run(_b, &_weights_transform);
+ _weights_manager->run(b, &_weights_transform);
// If we didn't run the reshape function, set the pretransposed buffer
if(!_weights_transform.is_reshape_run())
@@ -572,13 +573,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare()
static_cast<Tensor *>(_pretranspose)->allocator()->allocate();
ARM_COMPUTE_ERROR_ON(_pretranspose->buffer() == nullptr);
_gemm_kernel_asm->pretranspose_B_array(_pretranspose->buffer(), in1_ptr, ldb, multi_stride_b);
- _b->mark_as_unused();
+ b->mark_as_unused();
}
}
if(_gemm_info.method == AsmConvMethod::Indirect)
{
- prepare_indirect_buffer();
+ prepare_indirect_buffer(tensors);
}
_is_prepared = true;
@@ -601,37 +602,42 @@ bool Fallback<TypeInput, TypeOutput, OutputStage>::is_configured() const
}
template <typename TypeInput, typename TypeOutput, class OutputStage>
-void Fallback<TypeInput, TypeOutput, OutputStage>::run()
+void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
{
- int lda = _a->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ auto a = tensors.get_const_tensor(TensorType::ACL_SRC_0);
+ auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ auto c = tensors.get_const_tensor(TensorType::ACL_SRC_2);
+ auto d = tensors.get_tensor(TensorType::ACL_DST);
+
+ int lda = a->info()->strides_in_bytes().y() / sizeof(TypeInput);
int ldb = 0;
- const int ldd = _d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
+ const int ldd = d->info()->strides_in_bytes().y() / sizeof(TypeOutput);
const size_t a_batch_idx = _gemm_info.reinterpret_input_as_3d != 0 ? 3 : 2;
const size_t a_multi_idx = a_batch_idx + 1;
const size_t d_batch_idx = _gemm_info.depth_output_gemm3d != 0 ? 3 : 2;
const size_t d_multi_idx = d_batch_idx + 1;
- int batch_stride_a = _a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
- const int batch_stride_d = _d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
+ int batch_stride_a = a->info()->strides_in_bytes()[a_batch_idx] / sizeof(TypeInput);
+ const int batch_stride_d = d->info()->strides_in_bytes()[d_batch_idx] / sizeof(TypeOutput);
- int multi_stride_a = _a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
+ int multi_stride_a = a->info()->strides_in_bytes()[a_multi_idx] / sizeof(TypeInput);
int multi_stride_b = 0;
- const int multi_stride_d = _d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
+ const int multi_stride_d = d->info()->strides_in_bytes()[d_multi_idx] / sizeof(TypeOutput);
- auto in0_ptr = reinterpret_cast<const TypeInput *>(_a->buffer() + _a->info()->offset_first_element_in_bytes());
+ auto in0_ptr = reinterpret_cast<const TypeInput *>(a->buffer() + a->info()->offset_first_element_in_bytes());
const TypeInput *in1_ptr = nullptr;
- auto out_ptr = reinterpret_cast<TypeOutput *>(_d->buffer() + _d->info()->offset_first_element_in_bytes());
+ auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
// Check if B is pre-transposed and de-reference if not
if(!_gemm_kernel_asm->B_is_pretransposed())
{
- ldb = _b->info()->strides_in_bytes().y() / sizeof(TypeInput);
- multi_stride_b = _b->info()->strides_in_bytes().z() / sizeof(TypeInput);
- in1_ptr = reinterpret_cast<const TypeInput *>(_b->buffer() + _b->info()->offset_first_element_in_bytes());
+ ldb = b->info()->strides_in_bytes().y() / sizeof(TypeInput);
+ multi_stride_b = b->info()->strides_in_bytes().z() / sizeof(TypeInput);
+ in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
}
- const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, _d->info()->data_type());
+ const auto scheduling_hint = scheduling_hint_heuristic(_kernel_info.method, d->info()->data_type());
// Set workspace if needed and reset number of threads as buffer manager gets re-created with max_threads
if(_workspace.buffer() != nullptr)
@@ -654,13 +660,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
}
// Prepare assembly kernel
- prepare();
+ prepare(tensors);
// Set up matrix bias in the assembly kernel; it's just a pointer to matrix C.
TypeOutput *bias = nullptr;
- if(_c && _c->info()->data_type() != DataType::S32)
+ if(c && c->info()->data_type() != DataType::S32)
{
- bias = reinterpret_cast<TypeOutput *>(_c->buffer() + _c->info()->offset_first_element_in_bytes());
+ bias = reinterpret_cast<TypeOutput *>(c->buffer() + c->info()->offset_first_element_in_bytes());
}
if(_gemm_info.method == AsmConvMethod::Indirect)
@@ -682,7 +688,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
IWeightsManager *weights_manager)
{
Params p = extract_parameters(a, b, d, info);
@@ -699,7 +705,7 @@ void create_arm_gemm(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_ge
template <typename TypeInput, typename TypeOutput>
void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &arm_gemm, MemoryGroup &memory_group,
- const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
+ const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, arm_gemm::Activation activation, const AsmGemmInfo &info,
IWeightsManager *weights_manager)
{
ARM_COMPUTE_UNUSED(activation);
@@ -714,8 +720,8 @@ void create_arm_gemm_quant(std::unique_ptr<CpuGemmAssemblyDispatch::IFallback> &
// Configure requantization info
const int32_t negation = info.negated_offsets ? 1 : -1;
- const int32_t a_offset = -a->info()->quantization_info().uniform().offset * negation;
- const int32_t b_offset = -b->info()->quantization_info().uniform().offset * negation;
+ const int32_t a_offset = -a->quantization_info().uniform().offset * negation;
+ const int32_t b_offset = -b->quantization_info().uniform().offset * negation;
const GEMMLowpOutputStageInfo os_info = info.output_stage;
arm_gemm::Requantize32 gemm_requant_info{};
@@ -786,18 +792,18 @@ bool CpuGemmAssemblyDispatch::is_activation_supported(const ActivationLayerInfo
return act.type != arm_gemm::Activation::Type::None;
}
-void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info)
+void CpuGemmAssemblyDispatch::configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info)
{
ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, d);
arm_gemm::Activation act = map_to_arm_gemm_activation(info.activation_info);
//If we don't support a combination of data types, silently return: it is the caller's responsibility to check if configure() was successful via is_configured()
- if(!CpuGemmAssemblyDispatch::validate(a->info(), b->info(), c != nullptr ? c->info() : nullptr, d->info(), info))
+ if(!CpuGemmAssemblyDispatch::validate(a, b, c, d, info))
{
return;
}
- switch(a->info()->data_type())
+ switch(a->data_type())
{
case DataType::F32:
create_arm_gemm<float, float>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
@@ -805,7 +811,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, cons
#ifdef __aarch64__
case DataType::U8:
case DataType::QASYMM8:
- if(d->info()->data_type() == DataType::S32)
+ if(d->data_type() == DataType::S32)
{
create_arm_gemm<uint8_t, uint32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
}
@@ -816,7 +822,7 @@ void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, cons
break;
case DataType::S8:
case DataType::QASYMM8_SIGNED:
- if(d->info()->data_type() == DataType::S32)
+ if(d->data_type() == DataType::S32)
{
create_arm_gemm<int8_t, int32_t>(_arm_gemm, _memory_group, a, b, c, d, act, info, _weights_manager);
}
@@ -841,10 +847,10 @@ void CpuGemmAssemblyDispatch::configure(const ITensor *a, const ITensor *b, cons
}
}
-void CpuGemmAssemblyDispatch::prepare()
+void CpuGemmAssemblyDispatch::prepare(ITensorPack &tensors)
{
ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->prepare();
+ _arm_gemm->prepare(tensors);
}
bool CpuGemmAssemblyDispatch::is_configured() const
@@ -852,12 +858,12 @@ bool CpuGemmAssemblyDispatch::is_configured() const
return _arm_gemm != nullptr && _arm_gemm->is_configured();
}
-void CpuGemmAssemblyDispatch::run()
+void CpuGemmAssemblyDispatch::run(ITensorPack &tensors)
{
MemoryGroupResourceScope scope_mg(_memory_group);
ARM_COMPUTE_ERROR_ON(_arm_gemm == nullptr);
- _arm_gemm->run();
+ _arm_gemm->run(tensors);
}
} // namespace cpu
} // namespace arm_compute
diff --git a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 0bbae49a7e..ffc097c75c 100644
--- a/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/runtime/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -21,14 +21,15 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef SRC_NEGEMMASSEMBLYDISPATCH_H
-#define SRC_NEGEMMASSEMBLYDISPATCH_H
+#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
+#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-#include "arm_compute/runtime/IFunction.h"
#include "arm_compute/runtime/IMemoryManager.h"
#include "arm_compute/runtime/IWeightsManager.h"
#include "arm_compute/runtime/MemoryGroup.h"
#include "arm_compute/runtime/Tensor.h"
+#include "src/core/common/Macros.h"
+#include "src/runtime/cpu/ICpuOperator.h"
namespace arm_compute
{
@@ -57,29 +58,23 @@ struct AsmGemmInfo
};
/** Assembly kernel glue */
-class CpuGemmAssemblyDispatch : public IFunction
+class CpuGemmAssemblyDispatch : public ICpuOperator
{
public:
/** Constructor */
CpuGemmAssemblyDispatch(std::shared_ptr<IMemoryManager> memory_manager = nullptr, IWeightsManager *weights_manager = nullptr);
- /** Prevent instances of this class from being copy constructed */
- CpuGemmAssemblyDispatch(const CpuGemmAssemblyDispatch &) = delete;
- /** Prevent instances of this class from being copied */
- CpuGemmAssemblyDispatch &operator=(const CpuGemmAssemblyDispatch &) = delete;
- /** Default move constructor */
- CpuGemmAssemblyDispatch(CpuGemmAssemblyDispatch &&) = default;
- /** Default move assignment operator */
- CpuGemmAssemblyDispatch &operator=(CpuGemmAssemblyDispatch &&) = default;
/** Default destructor */
~CpuGemmAssemblyDispatch() = default;
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(CpuGemmAssemblyDispatch);
+
class IFallback
{
public:
- virtual void run() = 0;
- virtual void prepare() = 0;
- virtual bool is_configured() const = 0;
- virtual ~IFallback() = default;
+ virtual void run(ITensorPack &tensors) = 0;
+ virtual void prepare(ITensorPack &tensors) = 0;
+ virtual bool is_configured() const = 0;
+ virtual ~IFallback() = default;
};
public:
@@ -91,7 +86,7 @@ public:
* @param[out] d Output tensor to store the result of matrix multiplication. Data type supported: same as @p input0.
* @param[in] info GEMM meta-data
*/
- void configure(const ITensor *a, const ITensor *b, const ITensor *c, ITensor *d, const AsmGemmInfo &info);
+ void configure(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, ITensorInfo *d, const AsmGemmInfo &info);
/** Indicates whether or not this function can be used to process the given parameters.
*
@@ -118,8 +113,8 @@ public:
bool is_configured() const;
// Inherited methods overridden:
- void prepare() override;
- void run() override;
+ void prepare(ITensorPack &tensors) override;
+ void run(ITensorPack &tensors) override;
private:
std::unique_ptr<IFallback> _arm_gemm; /**< Interface for the arm_gemm fallback */
@@ -128,4 +123,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /* SRC_NEGEMMASSEMBLYDISPATCH_H */
+#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */