Diffstat (limited to 'src/cpu')
-rw-r--r--   src/cpu/operators/CpuFullyConnected.h                    |  22
-rw-r--r--   src/cpu/operators/CpuGemm.cpp                            | 195
-rw-r--r--   src/cpu/operators/CpuGemm.h                              |  21
-rw-r--r--   src/cpu/operators/CpuGemmConv2d.cpp                      | 372
-rw-r--r--   src/cpu/operators/CpuGemmConv2d.h                        |  55
-rw-r--r--   src/cpu/operators/CpuGemmDirectConv2d.cpp                |   8
-rw-r--r--   src/cpu/operators/CpuGemmDirectConv2d.h                  |  12
-rw-r--r--   src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp      |   8
-rw-r--r--   src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h        |  11
-rw-r--r--   src/cpu/operators/CpuMatMul.h                            |  11
-rw-r--r--   src/cpu/operators/CpuWinogradConv2d.cpp                  |  13
-rw-r--r--   src/cpu/operators/CpuWinogradConv2d.h                    |  32
-rw-r--r--   src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp   | 104
-rw-r--r--   src/cpu/operators/internal/CpuGemmAssemblyDispatch.h     |  13
-rw-r--r--   src/cpu/utils/CpuAuxTensorHandler.h                      |  16
15 files changed, 665 insertions, 228 deletions
diff --git a/src/cpu/operators/CpuFullyConnected.h b/src/cpu/operators/CpuFullyConnected.h
index 7073fb9f7c..b72f77e5c4 100644
--- a/src/cpu/operators/CpuFullyConnected.h
+++ b/src/cpu/operators/CpuFullyConnected.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_FULLY_CONNECTED_H
-#define ARM_COMPUTE_CPU_FULLY_CONNECTED_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
+#define ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/FullyConnectedLayerInfo.h"
@@ -145,13 +145,15 @@ private:
{
AsmGemmWorkspace = 0,
Pretranspose,
- GemmTemp1, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
- GemmTemp2, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
- GemmTemp3, // Both CpuGemm and CpuGemmLowpMatrixMultiplyCore
- GemmTemp4, // CpuGemmLowpMatrixMultiplyCore only
- GemmTemp5, // CpuGemmLowpMatrixMultiplyCore only
- GemmTemp6, // CpuGemmLowpMatrixMultiplyCore only
- GemmTemp7, // CpuGemmLowpMatrixMultiplyCore only
+ GemmTemp1,
+ GemmTemp2,
+ GemmTemp3,
+ GemmTemp4,
+ GemmTemp5,
+ GemmTemp6,
+ GemmTemp7,
+ GemmTemp8,
+ // Slots above (0-9) reserved for either CpuGemm or CpuGemmLowpMatrixMultiplyCore
TransposedWeights,
ConvertedWeights,
FlattenedSrc,
@@ -189,4 +191,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_FULLY_CONNECTED_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUFULLYCONNECTED_H
diff --git a/src/cpu/operators/CpuGemm.cpp b/src/cpu/operators/CpuGemm.cpp
index 8da166dbef..e035de0131 100644
--- a/src/cpu/operators/CpuGemm.cpp
+++ b/src/cpu/operators/CpuGemm.cpp
@@ -53,6 +53,8 @@ cpu::AsmGemmInfo init_assembly_metadata(const GEMMInfo &info)
asm_info.fast_mode = info.fast_math();
asm_info.fixed_format = info.fixed_format();
asm_info.weight_format = info.weight_format();
+    asm_info.transpose_b =
+        info.pretranspose_B(); // Note: this "pretranspose_B" flag is distinct from the pretranspose_B_array method; it tells pretranspose_B_array whether an additional transpose of B should be applied beforehand
return asm_info;
}
@@ -72,7 +74,7 @@ void CpuGemm::configure(const ITensorInfo *a,
const cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
const bool is_c_bias = beta == 1 && c != nullptr;
- bool run_optimised =
+ const bool run_optimised =
bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, (is_c_bias) ? c : nullptr, d, asm_info)) &&
(c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
!(!b->are_values_constant() &&
@@ -92,14 +94,17 @@ void CpuGemm::configure(const ITensorInfo *a,
if (run_optimised)
{
+ _run_interleave_transpose = false;
const ITensorInfo *c_to_use = is_c_bias ? c : nullptr;
_asm_glue = std::make_unique<cpu::CpuGemmAssemblyDispatch>();
_asm_glue->configure(a, b, c_to_use, d, asm_info);
ARM_COMPUTE_ERROR_ON(!_asm_glue->is_configured());
- auto asm_mem_req = _asm_glue->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretraspose] = asm_mem_req[Pretraspose];
+ const auto asm_mem_req = _asm_glue->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
// Scale product by alpha
if (_run_alpha_scale)
@@ -111,37 +116,74 @@ void CpuGemm::configure(const ITensorInfo *a,
}
else
{
+ _run_interleave_transpose = !_run_vector_matrix_multiplication;
// Pick output tensor in case bias addition should be performed
ITensorInfo *gemm_output_to_use = (_run_bias_addition) ? &_tmp_d : d;
+ // Pick b tensor in case pretranspose should be performed
+ const ITensorInfo *b_to_use = b;
_mm_kernel = std::make_unique<cpu::kernels::CpuGemmMatrixMultiplyKernel>();
+ // Configure rhs pretranspose
+ if (gemm_info.pretranspose_B())
+ {
+ _pretranspose_b_func = std::make_unique<CpuTranspose>();
+ _pretranspose_b_func->configure(b_to_use, &_pretransposed_b);
+ MemoryLifetime lifetime;
+ if (_reshape_b_only_on_first_run)
+ {
+ if (_run_interleave_transpose)
+ {
+ // PreTransposedRHS tensor is only used in prepare(), but is then succeeded by Transposed1xWRHS
+ // So PreTransposedRHS can be freed inside prepare()
+ lifetime = MemoryLifetime::Prepare;
+ }
+ else
+ {
+ // PreTransposedRHS tensor is only used in prepare(), but is the final transformation of rhs
+ // So PreTransposedRHS needs to persist beyond prepare()
+ lifetime = MemoryLifetime::Persistent;
+ }
+ }
+ else
+ {
+ // PreTransposedRHS tensor is always used in run() and doesn't need to persist
+ lifetime = MemoryLifetime::Temporary;
+ }
+ _aux_mem[PreTransposedRHS] =
+ MemoryInfo(offset_int_vec(PreTransposedRHS), lifetime, _pretransposed_b.total_size());
+ b_to_use = &_pretransposed_b;
+ }
+
// Select between GEMV and GEMM
if (_run_vector_matrix_multiplication)
{
// Configure the matrix multiply kernel
- _mm_kernel->configure(a, b, gemm_output_to_use, alpha, false);
+ _mm_kernel->configure(a, b_to_use, gemm_output_to_use, alpha, false);
}
else
{
- const int m = a->dimension(1);
- const int n = b->dimension(0);
- const int k = a->dimension(0);
-
+ ARM_COMPUTE_ERROR_ON(!_run_interleave_transpose);
// Configure interleave kernel
_interleave_kernel = std::make_unique<cpu::kernels::CpuGemmInterleave4x4Kernel>();
_interleave_kernel->configure(a, &_tmp_a);
_aux_mem[InterleavedLHS] =
MemoryInfo(offset_int_vec(InterleavedLHS), MemoryLifetime::Temporary, _tmp_a.total_size());
- // Configure transpose kernel
- _transpose_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
- _transpose_kernel->configure(b, &_tmp_b);
- _aux_mem[TransposedRHS] =
- MemoryInfo(offset_int_vec(TransposedRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+ // Configure rhs transpose1xw kernel
+ _transpose1xW_b_kernel = std::make_unique<cpu::kernels::CpuGemmTranspose1xWKernel>();
+ _transpose1xW_b_kernel->configure(b_to_use, &_tmp_b);
+ _aux_mem[Transposed1xWRHS] =
+ MemoryInfo(offset_int_vec(Transposed1xWRHS), MemoryLifetime::Persistent, _tmp_b.total_size());
+
+ // Use a and b here instead of _tmp_a and _tmp_b because CpuGemmMatrixMultiplyKernel requires the original m,n,k in case of interleaved a and transposed1xw b
+ const int m = a->dimension(1);
+ const int n = b_to_use->dimension(0);
+ const int k = a->dimension(0);
// Configure matrix multiplication kernel
- _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, true, GEMMReshapeInfo(m, n, k));
+ _mm_kernel->configure(&_tmp_a, &_tmp_b, gemm_output_to_use, alpha, _run_interleave_transpose,
+ GEMMReshapeInfo(m, n, k));
}
if (_run_bias_addition)
@@ -179,6 +221,16 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_UNUSED(alpha);
const bool is_c_bias = beta == 1 && c != nullptr;
const bool run_addition = c != nullptr && beta != 0 && beta != 1;
+ // Check if we should use the pretransposed_b or original b
+ // TODO: COMPMID-6597
+    // Note that this check should only apply to the non-optimized path. It is performed up front, rather than only
+    // in the fallback path, because of the checks performed below, between here and the run_optimised decision.
+ // We should simplify this by
+ // 1. Moving the checks between "fix-start" and "fix-end" into their corresponding ops / kernels (e.g. the weights format checks can and should be moved into CpuGemmAssemblyDispatch)
+ // 2. Moving this b_to_use check back into the non-optimized path
+ TensorInfo pretransposed_b = b->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*b));
+ const ITensorInfo *b_to_use = gemm_info.pretranspose_B() ? &pretransposed_b : b;
+ // TODO: COMPMID-6597 fix-start
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(a);
ARM_COMPUTE_RETURN_ERROR_ON_CPU_BF16_UNSUPPORTED(a);
@@ -187,16 +239,16 @@ Status CpuGemm::validate(const ITensorInfo *a,
if (is_fixed_format_fast_math(gemm_info.weight_format()))
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(a, DataType::F32);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b, DataType::BFLOAT16);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(b_to_use, DataType::BFLOAT16);
}
else
{
- ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, b_to_use);
}
const int block_by = arm_compute::block_by(gemm_info.weight_format());
// test if im2col has changed the dimensions that are needed for padding
- if (a->dimension(0) != b->dimension(1) && block_by > 1)
+ if (a->dimension(0) != b_to_use->dimension(1) && block_by > 1)
{
// have to verify bias
const size_t dim0_sz = a->dimension(0);
@@ -204,18 +256,18 @@ Status CpuGemm::validate(const ITensorInfo *a,
(dim0_sz % block_by) != 0,
("The matrix A number of columns must be a multiple of block_by=" + std::to_string(block_by)).c_str());
// a->dimension(0) = kernel_area * input_channel + kernel_area * input_pad_right
- // b->dimension(1) = kernel_area * input_channel
- // a->dimension(0) = b->dimension(1) + kernel_area * input_pad_right
- const size_t input_pad_right = (dim0_sz - b->dimension(1)) % block_by;
- const size_t kernel_area = (dim0_sz - b->dimension(1)) / input_pad_right;
+ // b_to_use->dimension(1) = kernel_area * input_channel
+ // a->dimension(0) = b_to_use->dimension(1) + kernel_area * input_pad_right
+ const size_t input_pad_right = (dim0_sz - b_to_use->dimension(1)) % block_by;
+ const size_t kernel_area = (dim0_sz - b_to_use->dimension(1)) / input_pad_right;
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- (dim0_sz - kernel_area * input_pad_right) != b->dimension(1),
+ (dim0_sz - kernel_area * input_pad_right) != b_to_use->dimension(1),
"The product AB is defined only if A number of columns and B number of rows are related");
}
else
{
ARM_COMPUTE_RETURN_ERROR_ON_MSG(
- a->dimension(0) != b->dimension(1),
+ a->dimension(0) != b_to_use->dimension(1),
"The product AB is defined only if the number of columns in A is equal to the number of rows in B");
}
@@ -233,14 +285,14 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(c, d);
ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1),
"The C matrix must have the same number of rows as the matrix A");
- ARM_COMPUTE_RETURN_ERROR_ON_MSG(b->dimension(0) != c->dimension(0),
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(b_to_use->dimension(0) != c->dimension(0),
"The C matrix must have the same number of columns as the matrix B");
}
if (d->total_size() != 0)
{
// For fixed format we are expecting some kind of blocked format for B/RHS so the dimension won't necessarily match the result matrix any more.
- ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b->dimension(0) != d->dimension(0));
+ ARM_COMPUTE_RETURN_ERROR_ON(!gemm_info.fixed_format() && b_to_use->dimension(0) != d->dimension(0));
if (gemm_info.depth_output_gemm3d() != 0)
{
if (gemm_info.reinterpret_input_as_3d())
@@ -258,10 +310,14 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_RETURN_ERROR_ON(a->dimension(1) != d->dimension(1));
}
}
+ // TODO: COMPMID-6597 fix-end
// Check if we need to run the optimized assembly kernel
cpu::AsmGemmInfo asm_info = init_assembly_metadata(gemm_info);
- const bool run_optimised =
+
+ // Note we use b instead of b_to_use here because asm_info also captures the pretranspose_b() flag
+ // so we pass the original b to CpuGemmAssemblyDispatch
+ const bool run_optimised =
bool(cpu::CpuGemmAssemblyDispatch::validate(a, b, is_c_bias ? c : nullptr, d, asm_info)) &&
(c == nullptr || beta == 0.f || beta == 1.f) && // Optimized GeMM doesn't support beta coefficient.
!(!b->are_values_constant() &&
@@ -277,13 +333,13 @@ Status CpuGemm::validate(const ITensorInfo *a,
// Check if the first input tensor is a vector.
const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
// Check if we need to reshape the matrix A and matrix B
- const bool run_interleave_transpose = !run_vector_matrix_multiplication && !b->are_values_constant();
+ const bool run_interleave_transpose = !run_vector_matrix_multiplication;
// Arguments used by GEMMReshapeInfo
// If we pass the matrix A and matrix B reshaped to CpuGemmMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to GEMMReshapeInfo
// in order to know how the matrices have been reshaped
const int m = a->dimension(1);
- const int n = b->dimension(0);
+ const int n = b_to_use->dimension(0);
const int k = a->dimension(0);
int mult_transpose1xW_width = 1;
int mult_interleave4x4_height = 1;
@@ -292,7 +348,7 @@ Status CpuGemm::validate(const ITensorInfo *a,
m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, gemm_info.depth_output_gemm3d());
const ITensorInfo *matrix_a_info = a;
- const ITensorInfo *matrix_b_info = b;
+ const ITensorInfo *matrix_b_info = b_to_use;
TensorInfo tmp_a_info{};
TensorInfo tmp_b_info{};
@@ -309,9 +365,10 @@ Status CpuGemm::validate(const ITensorInfo *a,
ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmInterleave4x4Kernel::validate(a, &tmp_a_info));
// Validate transpose kernel
- auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_transpose1xW_with_element_size_shape(
- *b, mult_transpose1xW_width)));
- ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b, &tmp_b_info));
+ auto_init_if_empty(tmp_b_info,
+ b_to_use->clone()->set_tensor_shape(
+ compute_transpose1xW_with_element_size_shape(*b_to_use, mult_transpose1xW_width)));
+ ARM_COMPUTE_RETURN_ON_ERROR(cpu::kernels::CpuGemmTranspose1xWKernel::validate(b_to_use, &tmp_b_info));
}
// Validate matrix multiply
@@ -367,29 +424,46 @@ void CpuGemm::run(ITensorPack &tensors)
else
{
CpuAuxTensorHandler interleaved_a(offset_int_vec(InterleavedLHS), _tmp_a, tensors, true);
- CpuAuxTensorHandler transposed_b(offset_int_vec(TransposedRHS), _tmp_b, tensors, true);
+ CpuAuxTensorHandler pretransposed_b(offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors);
+ CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors, true);
CpuAuxTensorHandler temp_d(offset_int_vec(TempResult), _tmp_d, tensors, true);
ITensorPack mm_pack{{ACL_SRC_0, a}, {ACL_SRC_1, b}, {ACL_DST, (_run_bias_addition) ? temp_d.get() : d}};
- if (!_run_vector_matrix_multiplication)
+
+ if (_run_interleave_transpose)
{
// Run interleave kernel
ITensorPack interleave_pack{{ACL_SRC, a}, {ACL_DST, interleaved_a.get()}};
NEScheduler::get().schedule_op(_interleave_kernel.get(), Window::DimY, _interleave_kernel->window(),
interleave_pack);
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
+ }
+ const ITensor *b_to_use = b;
+ if (_pretranspose_b_func)
+ {
if (!_reshape_b_only_on_first_run)
{
- // Run transpose kernel
- ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
- transpose_pack);
+ // Run pretranspose kernel
+ ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+ _pretranspose_b_func->run(pretranspose_pack);
}
-
- // Use reshaped matrices
- mm_pack.add_const_tensor(ACL_SRC_0, interleaved_a.get());
- mm_pack.add_const_tensor(ACL_SRC_1, transposed_b.get());
+ b_to_use = pretransposed_b.get();
+ }
+ if (_run_interleave_transpose)
+ {
+ if (!_reshape_b_only_on_first_run)
+ {
+ // Run transpose1xw kernel
+ ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+ NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+ _transpose1xW_b_kernel->window(), transpose_pack);
+ }
+ b_to_use = transposed1xw_b.get();
}
+ // Use reshaped matrices
+ mm_pack.add_const_tensor(ACL_SRC_1, b_to_use);
NEScheduler::get().schedule_op(_mm_kernel.get(),
_run_vector_matrix_multiplication ? Window::DimX : Window::DimY,
@@ -426,17 +500,32 @@ void CpuGemm::prepare(ITensorPack &tensors)
{
_asm_glue->prepare(tensors);
}
- else if (_reshape_b_only_on_first_run && !_run_vector_matrix_multiplication)
+ else if (_reshape_b_only_on_first_run)
{
- const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
- ITensor *b_aux =
- utils::cast::polymorphic_cast<ITensor *>(tensors.get_tensor(offset_int_vec(TransposedRHS)));
- ARM_COMPUTE_ERROR_ON_NULLPTR(b, b_aux);
-
- CpuAuxTensorHandler transposed_b(_tmp_b, *b_aux);
- ITensorPack transpose_pack{{ACL_SRC, b}, {ACL_DST, transposed_b.get()}};
- NEScheduler::get().schedule_op(_transpose_kernel.get(), Window::DimY, _transpose_kernel->window(),
- transpose_pack);
+ const ITensor *b = tensors.get_const_tensor(ACL_SRC_1);
+ const ITensor *b_to_use = b;
+ CpuAuxTensorHandler pretransposed_b(
+ offset_int_vec(PreTransposedRHS), _pretransposed_b, tensors,
+ false /*pack_inject: no need to inject into tensors*/,
+ _pretranspose_b_func ==
+ nullptr /*bypass_alloc: no need to allocate if _pretranspose_b_func is not run*/);
+ CpuAuxTensorHandler transposed1xw_b(offset_int_vec(Transposed1xWRHS), _tmp_b, tensors,
+ false /*pack_inject*/, !_run_interleave_transpose /*bypass_alloc*/);
+
+ if (_pretranspose_b_func)
+ {
+ // Run pretranspose kernel
+ ITensorPack pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pretransposed_b.get()}};
+ _pretranspose_b_func->run(pretranspose_pack);
+ b_to_use = pretransposed_b.get();
+ }
+ if (_run_interleave_transpose)
+ {
+ // Run transpose kernel
+ ITensorPack transpose_pack{{ACL_SRC, b_to_use}, {ACL_DST, transposed1xw_b.get()}};
+ NEScheduler::get().schedule_op(_transpose1xW_b_kernel.get(), Window::DimY,
+ _transpose1xW_b_kernel->window(), transpose_pack);
+ }
}
_is_prepared = true;
}
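
The fallback path above now chains up to two reshapes of the RHS before the matrix-multiply kernel: an optional CpuTranspose (when GEMMInfo::pretranspose_B() is set) followed by the existing Transpose1xW reshape, with the PreTransposedRHS lifetime chosen from _reshape_b_only_on_first_run and _run_interleave_transpose. The following standalone sketch (not ACL code; names are illustrative) reproduces just that lifetime decision:

    #include <cstdio>

    enum class MemoryLifetime { Temporary, Prepare, Persistent };

    // reshape_b_only_on_first_run: B is constant, so its reshape can be done once in prepare()
    // run_interleave_transpose:    a Transpose1xW reshape follows and supersedes the pretranspose
    MemoryLifetime pretransposed_rhs_lifetime(bool reshape_b_only_on_first_run, bool run_interleave_transpose)
    {
        if (!reshape_b_only_on_first_run)
        {
            return MemoryLifetime::Temporary; // recomputed on every run()
        }
        // Only used inside prepare(); can be freed afterwards iff a later transformation supersedes it
        return run_interleave_transpose ? MemoryLifetime::Prepare : MemoryLifetime::Persistent;
    }

    int main()
    {
        std::printf("%d %d %d\n", static_cast<int>(pretransposed_rhs_lifetime(false, true)),
                    static_cast<int>(pretransposed_rhs_lifetime(true, true)),
                    static_cast<int>(pretransposed_rhs_lifetime(true, false)));
        return 0;
    }
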
diff --git a/src/cpu/operators/CpuGemm.h b/src/cpu/operators/CpuGemm.h
index 6b30d134fa..a05258d206 100644
--- a/src/cpu/operators/CpuGemm.h
+++ b/src/cpu/operators/CpuGemm.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_GEMM_H
-#define ARM_COMPUTE_CPU_GEMM_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMM_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMM_H
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
@@ -36,6 +36,7 @@
#include "src/cpu/kernels/CpuGemmTranspose1xWKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuAdd.h"
+#include "src/cpu/operators/CpuTranspose.h"
#include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"
#include <memory>
@@ -144,16 +145,17 @@ public:
private:
enum AuxTensorIdx
{
- AsmGemmWorkspace = 0,
- Pretraspose,
- InterleavedLHS,
- TransposedRHS,
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ InterleavedLHS = 3,
+ PreTransposedRHS,
+ Transposed1xWRHS,
TempResult,
Count
};
std::unique_ptr<kernels::CpuGemmInterleave4x4Kernel> _interleave_kernel{nullptr};
- std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose_kernel{nullptr};
+ std::unique_ptr<CpuTranspose> _pretranspose_b_func{nullptr};
+ std::unique_ptr<kernels::CpuGemmTranspose1xWKernel> _transpose1xW_b_kernel{nullptr};
std::unique_ptr<kernels::CpuGemmMatrixMultiplyKernel> _mm_kernel{nullptr};
std::unique_ptr<CpuGemmAssemblyDispatch> _asm_glue{nullptr};
std::unique_ptr<kernels::CpuGemmMatrixAdditionKernel> _ma_kernel{nullptr};
@@ -162,10 +164,13 @@ private:
std::unique_ptr<CpuActivation> _activation_func{nullptr};
TensorInfo _tmp_a{};
+ TensorInfo _pretransposed_b{};
TensorInfo _tmp_b{};
TensorInfo _tmp_d{};
bool _run_vector_matrix_multiplication{false};
+ bool _run_interleave_transpose{
+ true}; /**< If we run CpuGemmInterleave4x4Kernel on lhs and CpuGemmTranspose1xWKernel on rhs */
bool _run_alpha_scale{false};
bool _run_addition{false};
bool _run_bias_addition{false};
@@ -177,4 +182,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_GEMM_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMM_H
diff --git a/src/cpu/operators/CpuGemmConv2d.cpp b/src/cpu/operators/CpuGemmConv2d.cpp
index 7c59d88c61..117527ccc1 100644
--- a/src/cpu/operators/CpuGemmConv2d.cpp
+++ b/src/cpu/operators/CpuGemmConv2d.cpp
@@ -32,7 +32,9 @@
#include "arm_compute/runtime/NEON/NEScheduler.h"
#include "src/common/utils/Log.h"
+#include "src/core/helpers/AutoConfiguration.h"
#include "src/core/helpers/MemoryHelpers.h"
+#include "src/core/helpers/Utils.h"
#include "src/cpu/kernels/CpuCol2ImKernel.h"
#include "src/cpu/kernels/CpuIm2ColKernel.h"
#include "src/cpu/kernels/CpuWeightsReshapeKernel.h"
@@ -52,6 +54,117 @@ namespace arm_compute
{
namespace cpu
{
+
+/** @section note_CpuGemmConv2d_weight_transformation Weight Transformations in CpuGemmConv2d
+ *
+ * A. Terminology
+ * Throughout CpuGemmConv2d, we use the following terms in ways that may differ from other operators / kernels:
+ * - "Transform" or "Reshape" of the weights: they both mean all the operations that we perform on the weight
+ * tensor up until they are consumed by gemm (CpuGemm or CpuGemmLowpMatrixMultiplyCore)
+ * Note that the specific gemm operator may perform further transformations on the weights, but the
+ * transformations here only mean those performed in CpuGemmConv2d
+ * - "Transpose" of weights: The @ref CpuTranspose operation. I.e. transpose of the weights' lowest two
+ * dimensions
+ *
+ * B. Gemm-based conv2d
+ * We want to convert the 2d convolution op (ignoring bias):
+ * dst = conv2d(src, weight)
+ * into a matrix multiplication op:
+ * gemm_dst = gemm(lhs, rhs)
+ *
+ * E.g.: For data layout NHWC
+ * 3 (hi) <----------> (lo) 0
+ * src.shape = [batch, in_h , in_w, in_c]
+ * weight.shape = [out_c, k_h , k_w, in_c]
+ * dst.shape = [batch, out_h, out_w, out_c]
+ *
+ * This requires three transformations:
+ * * src -> lhs, transform conv input to gemm lhs; gemm_lhs is a 2d matrix where each row (or column,
+ * depending on the convention) is a linearized "patch" of the conv_input that corresponds to
+ * the receptive field of the corresponding output element.
+ * The convention is to use "column", but to disambiguate from the column vector of a matrix,
+ * in this documentation we shall use "patch".
+ * This transform is called im2col (for details see @ref CpuIm2ColKernel)
+ * * weight -> rhs, transform conv weight to gemm rhs, known as weight transform/reshape (wt)
+ * * gemm_dst -> dst, transform gemm output back to conv output, known as col2im (for details see
+ * @ref CpuCol2ImKernel)
+ *
+ * This section focuses on the weight transformation and assumes the im2col is already performed
+ *
+ * C. Weight Transformation
+ * After im2col, assume: lhs.shape = [num_patch, patch_size],
+ * where patch_size is the number of elements in a "patch": patch_size = k_h * k_w * in_c
+ * num_patch is the number of patches; we can ignore it here (for details see @ref CpuIm2ColKernel)
+ *
+ * After wt, rhs should have the shape: rhs = [patch_size, out_c]
+ *
+ * Therefore, the weight transformation consists of two steps:
+ *    1. Collapsing the 3 lowest dimensions (k_h, k_w, in_c): [out_c, k_h, k_w, in_c] -> [out_c, patch_size]
+ * 2. Transpose the collapsed shape: [out_c, patch_size] -> [patch_size, out_c]
+ *
+ * D. Implementation
+ * There are 4 paths for weight transformation
+ *
+ * 1. Path 1: Fixed weight format - no transformation
+ * The underlying gemm kernel may adopt fixed weight format (isVarWeightsKernel() == true), which requires
+ * that no weight transformation shall be performed
+ * Note that this no-transform requirement applies both to this op (CpuGemmConv2d) and the constituent ops, up
+ * until the fixed format kernels themselves
+ *
+ * 2. Path 2: Reinterpret then transpose later
+ * If the weight tensor has no "holes" (see @ref has_holes), there are two optimizations we can apply:
+ * - We can ignore the first step (collapsing of spatial dimensions) by simply re-interpreting the shape
+ * in TensorInfo
+ * - Instead of performing transpose here, we can pass the transpose flag to the underlying gemm. The gemm
+ * may then decide to fuse the transpose with any further transformations
+ *
+ * 3. Path 3: Reshape then transpose later
+ * If the weight tensor has holes, then we use a dedicated @ref CpuReshape, followed by transpose later
+ *
+ * 4. Path 4: Fused reshape and transpose
+ * This is only for quantized types for now (TODO: Remove (COMPMID-6596)). We fall back to a legacy
+ * non-optimized kernel @ref CpuWeightsReshapeKernel to perform a fused reshape + transpose
+ *
+ * Path 1 is the long term solution that we shall migrate to once (if) we adopt fixed weight format for all gemm
+ * kernels.
+ * In the short term, Path 2 is the favored, more performant path.
+ */
+
+namespace
+{
+/** Initialize reshaped / transformed weight info
+ *
+ * @param[in] weights Input weights
+ * @param[out] reshaped_weights Transformed weights
+ */
+void initialize_reshaped_weight_info(const ITensorInfo &weights, ITensorInfo &reshaped_weights)
+{
+ auto_init_if_empty(reshaped_weights, weights);
+ if (is_data_type_quantized(weights.data_type()))
+ {
+ // WT method: FusedReshapeAndTranspose
+ reshaped_weights.set_tensor_shape(compute_weights_reshaped_shape(weights, /* has_bias */ false));
+ }
+ else
+ {
+ TensorShape collapsed_weights = weights.tensor_shape();
+ collapsed_weights.collapse(3);
+ reshaped_weights.set_tensor_shape(collapsed_weights);
+ }
+}
+} // namespace
+
+CpuGemmConv2d::WeightTransformMethod CpuGemmConv2d::get_wt_method(const ITensorInfo &weights)
+{
+ // TODO: Extend ReinterpretThenTranspose support for quantized data types COMPMID-6596
+ if (is_data_type_quantized(weights.data_type()))
+ {
+ return WeightTransformMethod::FusedReshapeAndTranspose;
+ }
+ return has_holes(weights) ? WeightTransformMethod::ReshapeThenTranspose
+ : WeightTransformMethod::ReinterpretThenTranspose;
+}
+
CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo *src,
const ITensorInfo *weights,
const PadStrideInfo &conv_info,
@@ -96,7 +209,8 @@ CpuGemmConv2d::SkipInfo CpuGemmConv2d::skip_im_col_info(const ITensorInfo
}
CpuGemmConv2d::CpuGemmConv2d()
- : _weights_reshape_kernel(nullptr),
+ : _weights_reshape(nullptr),
+ _weights_reshape_and_transpose_kernel(nullptr),
_im2col_kernel(),
_mm_gemm(),
_mm_gemmlowp(),
@@ -111,6 +225,8 @@ CpuGemmConv2d::CpuGemmConv2d()
_skip_col2im(false),
_is_quantized(false),
_is_prepared(false),
+ _wt_method(WeightTransformMethod::ReshapeThenTranspose),
+ _run_wt(true),
_aux_mem(AuxTensorIdx::Count)
{
}
@@ -130,12 +246,6 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, act_info, enable_fast_math, gemm_3d_depth,
_skip_im2col, fixed_format, weight_format));
- // Create GEMMInfo structure
- const GEMMInfo &gemm_info =
- GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
- _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
- false, enable_fast_math, false, act_info, fixed_format, weight_format);
-
// Supported activations in GEMM
const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = {
ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
@@ -184,7 +294,8 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
_mm_gemmlowp = std::make_unique<CpuGemmLowpMatrixMultiplyCore>();
_mm_gemmlowp->configure(&tmp_src, &tmp_weights, biases, dst,
GEMMInfo(false, false, true, gemm_3d_depth, _skip_im2col, false, output_info, false,
- enable_fast_math, false, act_info, fixed_format, weight_format));
+ enable_fast_math, false, act_info, fixed_format, weight_format,
+ false /* pretranspose_B. TODO: COMPMID-6596 */));
auto mm_mem_req = _mm_gemmlowp->workspace();
for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
@@ -194,6 +305,13 @@ void CpuGemmConv2d::configure_mm(const ITensorInfo *src,
}
else
{
+ // Create GEMMInfo structure
+ const GEMMInfo &gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ _skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false,
+ GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format,
+                     true /*pretranspose_B. For fp gemm (wt path 1 - 3), we always pretranspose B (for wt path 1 this
+ flag is ignored)*/);
// Configure matrix multiply function
_mm_gemm = std::make_unique<CpuGemm>();
_mm_gemm->configure(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
@@ -220,12 +338,6 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
const bool is_activation_enabled = act_info.enabled();
- // Create GEMMInfo structure
- const GEMMInfo gemm_info =
- GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
- skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
- false, enable_fast_math, false, act_info, fixed_format, weight_format);
-
if (is_quantized)
{
// Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
@@ -266,10 +378,19 @@ Status CpuGemmConv2d::validate_mm(const ITensorInfo *src,
return CpuGemmLowpMatrixMultiplyCore::validate(input_qa.get(), weights_qa.get(), biases, dst,
GEMMInfo(false, false, true, gemm_3d_depth, skip_im2col, false,
- output_info, false, enable_fast_math, false, act_info));
+ output_info, false, enable_fast_math, false, act_info,
+ false /* pretranspose_B. TODO: COMPMID-6596 */));
}
else
{
+ // Create GEMMInfo structure
+ const GEMMInfo gemm_info =
+ GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false,
+ GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info, fixed_format, weight_format,
+                    true /*pretranspose_B. For fp gemm (wt path 1 - 3), we always pretranspose B (for wt path 1 this
+ flag is ignored)*/);
+
// Perform validation step on Matrix multiply function
return CpuGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
}
@@ -353,13 +474,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src,
unsigned int stride_y = 0;
std::tie(stride_x, stride_y) = conv_info.stride();
- unsigned int mat_weights_cols = weights->dimension(idx_kernels);
-
- // _weights_reshaped will be auto configured in the kernel.
- // Just append biases and do not transpose 1xW as it will be reshaped in CpuGemm
- _weights_reshape_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
- _weights_reshape_kernel->configure(weights, nullptr, &_weights_reshaped);
- _weights_reshaped.set_quantization_info(weights->quantization_info());
+ // Initialize reshaped weights
+ initialize_reshaped_weight_info(*weights, _weights_reshaped);
// Create tensor to store im2col reshaped inputs
if (!_skip_im2col)
@@ -380,6 +496,8 @@ void CpuGemmConv2d::configure(const ITensorInfo *src,
gemm_input_to_use = &_im2col_output;
}
+ const unsigned int mat_weights_cols = weights->dimension(idx_kernels);
+
// Create temporary GEMM output tensor in case we cannot skip col2im
const DataType output_data_type = data_type == DataType::BFLOAT16 ? DataType::F32 : data_type;
if (!_skip_col2im)
@@ -412,9 +530,38 @@ void CpuGemmConv2d::configure(const ITensorInfo *src,
// In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
+ /** @section note_CpuGemmConv2d_weight_use_in_configure Which weights tensor should we use to configure gemm
+ *
+ * A. The problem:
+ * In principle, we should use the weights tensor corresponding to the weights transformation path. I.e.:
+ * - If no weight transformation (_run_wt == false): Use original weights
+ * - else: Use transformed weights
+ * However in practice we have a dilemma:
+ * - We need to know _run_wt before we can configure gemm with the corresponding weights, but
+ * - _run_wt depends on isVarWeightsKernel(), which is only known after gemm is configured
+ *
+ * B. The decision:
+ * To simplify the matter, we decide to always use the transformed weights, regardless of _run_wt
+ *
+ * This decision requires the following conditions:
+ * 1. The underlying gemm where isVarWeightsKernel() == true, must guarantee that:
+ * A. Ignore the flag to transpose weights (GEMMInfo::pretranspose_B)
+ * B. Use weights/B tensor passed to it at prepare() or run() instead of that passed at configure()
+ * 2. CpuGemmConv2d where isVarWeightsKernel() == true, must guarantee that:
+ * A. Pass original weights instead of reshaped or reinterpreted weights
+ *
+ * C. Future actions:
+ * Condition 2 is a given, based on our implementation.
+ * If condition 1 cannot hold, we must make changes to the underlying gemm to:
+ * 1. Either expose isVarWeightsKernel() before gemm is configured somehow, or
+ * 2. Take in an additional "original_weights" tensor info at configure
+ */
configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, act_info, enable_fast_math,
gemm_3d_depth, fixed_format, weights_info.weight_format());
+ // Can only decide isVarWeightsKernel after gemm is configured
+ _run_wt = !isVarWeightsKernel();
+
if (!_skip_col2im && _data_layout == DataLayout::NCHW)
{
// Configure col2im
@@ -428,18 +575,27 @@ void CpuGemmConv2d::configure(const ITensorInfo *src,
_reshape->configure(gemm_output_to_use, dst);
}
- // Check if GEMM transforms weights
- // Modernise through COMPMID-4535
- bool gemm_trans_wei = _aux_mem[1].size > 0; // Asm Pretranspose
- gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[3].size > 0 : gemm_trans_wei; // Tranpose RHS
- gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[5].size > 0 : gemm_trans_wei; // Transpose RHS
-
// Check lifetime
_aux_mem[Im2ColOutput] =
MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size());
- _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
- gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
- _weights_reshaped.total_size());
+ // Add WeightsReshaped memory requirement to workspace
+    // Note that in the case of WeightTransformMethod::ReinterpretThenTranspose, we do not need to allocate this memory.
+    // However, since we cannot determine the weight transformation method until prepare() runs, we have to settle for
+    // allocating more
+ if (_run_wt)
+ {
+ // Check if GEMM transforms weights
+ // If weight is further transformed by underlying gemm after ReshapeThenTranspose then we can free
+ // WeightsReshaped in prepare
+ // Otherwise WeightsReshaped is the final transformation of weights and needs to persist
+ bool gemm_trans_wei = _aux_mem[GemmAsmPretransposedRHS].size > 0;
+ gemm_trans_wei = _mm_gemm != nullptr ? _aux_mem[GemmTransposed1xWRHS].size > 0 : gemm_trans_wei;
+ gemm_trans_wei = _mm_gemmlowp != nullptr ? _aux_mem[GemmLowpTransposed1xWRHS].size > 0 : gemm_trans_wei;
+
+ _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped),
+ gemm_trans_wei ? MemoryLifetime::Prepare : MemoryLifetime::Persistent,
+ _weights_reshaped.total_size());
+ }
_aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size());
}
@@ -471,10 +627,18 @@ Status CpuGemmConv2d::has_opt_impl(arm_compute::WeightFormat &expected_weight_fo
const bool skip_col2im = skip_info.skip_col2im;
const unsigned int gemm_3d_depth = skip_col2im ? conv_h : 0;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
- const GEMMInfo gemm_info =
- GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
- skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false, GEMMLowpOutputStageInfo(),
- false, enable_fast_math, false, act_info, fixed_format, weights_info.weight_format());
+
+ /** @section note_CpuGemmConv2d_weight_use_in_has_opt_impl Which weights tensor should we use for has_opt_impl
+ *
+ * For the pretranspose_B flag, this shares a similar problem and thus the same decision as that of
+ * @ref note_CpuGemmConv2d_weight_use_in_configure
+ *
+ * But for the weights, we shall always use the original instead of reshaped weights here
+ */
+ const GEMMInfo gemm_info = GEMMInfo(false, false, true /* Reshape weights only for the first run */, gemm_3d_depth,
+ skip_im2col /* Reinterpret the input as 3D if im2col is skipped */, false,
+ GEMMLowpOutputStageInfo(), false, enable_fast_math, false, act_info,
+ fixed_format, weights_info.weight_format(), true /* pretranspose_B */);
return CpuGemm::has_opt_impl(expected_weight_format, src, weights, biases, dst, gemm_info);
}
@@ -565,8 +729,10 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src,
unsigned int mat_weights_rows =
weights->dimension(idx_width) * weights->dimension(idx_height) * weights->dimension(idx_channel);
- weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, append_bias), 1, weights->data_type());
- weights_reshaped_info.set_quantization_info(weights->quantization_info());
+ // Initialize reshaped weights
+ initialize_reshaped_weight_info(*weights, weights_reshaped_info);
+ // No need to call CpuReshape::validate() or CpuTranspose::validate() as the dst info is auto-configured from the
+ // src
weights_to_use = &weights_reshaped_info;
if (!skip_im2col)
@@ -613,6 +779,7 @@ Status CpuGemmConv2d::validate(const ITensorInfo *src,
gemm_output_to_use = &info_gemm;
const bool fixed_format = weights_info.weight_format() != arm_compute::WeightFormat::UNSPECIFIED;
+ // See note_CpuGemmConv2d_weight_use_in_configure regarding the choice of the weights
ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, act_info,
enable_fast_math, skip_col2im ? conv_h : 0, skip_im2col, fixed_format,
weights_info.weight_format()));
@@ -637,7 +804,6 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
CpuAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false);
CpuAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false);
- CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false);
bool out_has_padding = _skip_col2im && (dst->info()->padding().bottom != 0 || dst->info()->padding().top != 0);
if (!_skip_im2col)
@@ -666,25 +832,32 @@ void CpuGemmConv2d::run(ITensorPack &tensors)
gemm_output_to_use = dst;
}
- // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
- ITensorPack pack_mm = tensors;
- pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
- if (!this->isVarWeightsKernel())
- {
- pack_mm.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
- }
- pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
- if (_is_quantized)
+ ITensorPack gemm_pack = tensors;
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use);
+ gemm_pack.add_tensor(TensorType::ACL_DST, gemm_output_to_use);
+ // Allocate reshaped weights if required
+ auto weights = gemm_pack.get_const_tensor(TensorType::ACL_SRC_1);
+ CpuAuxTensorHandler reinterpreted_wei(
+ _weights_reshaped,
+ *weights); // Re-interpreted weights. Only tensor shape is changed. No allocation
+ CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
+ // Update the weights to use if it has been reshaped
+ if (_run_wt)
{
- // Run gemmlowp
- _mm_gemmlowp->run(pack_mm);
- }
- else
- {
- // Run gemm
- _mm_gemm->run(pack_mm);
+ if (_wt_method == WeightTransformMethod::ReinterpretThenTranspose)
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
+ }
+ else if (_wt_method == WeightTransformMethod::ReshapeThenTranspose ||
+ _wt_method == WeightTransformMethod::FusedReshapeAndTranspose)
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ }
}
+ // Runs CpuGemm or CpuGemmLowpMatrixMultiplyCore functions
+ _is_quantized ? _mm_gemmlowp->run(gemm_pack) : _mm_gemm->run(gemm_pack);
+
// Reshape output matrix
if (!_skip_col2im)
{
@@ -710,24 +883,87 @@ void CpuGemmConv2d::prepare(ITensorPack &tensors)
{
if (!_is_prepared)
{
- // Variable weights executions that use fixed-format kernels
- // need no reshaping of the weights.
- if (this->isVarWeightsKernel())
+ auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
+ // Determine which weights reshape path to take
+ // Note that this decision can only occur at prepare instead of configure because it relies on the presence of
+ // any holes in the weight tensor, which may change after configure (e.g. from extending padding)
+ if (_run_wt)
{
- _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors);
- _is_prepared = true;
- return;
+ _wt_method = get_wt_method(*(weights->info()));
+ switch (_wt_method)
+ {
+ case (WeightTransformMethod::FusedReshapeAndTranspose):
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: FusedReshapeAndTranspose");
+ _weights_reshape_and_transpose_kernel = std::make_unique<kernels::CpuWeightsReshapeKernel>();
+ _weights_reshape_and_transpose_kernel->configure(weights->info(), nullptr, &_weights_reshaped);
+ break;
+ }
+ case (WeightTransformMethod::ReshapeThenTranspose):
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReshapeThenTranspose");
+ _weights_reshape = std::make_unique<CpuReshape>();
+ _weights_reshape->configure(weights->info(), &_weights_reshaped);
+ break;
+ }
+ case (WeightTransformMethod::ReinterpretThenTranspose):
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Perform weight transformation: ReinterpretThenTranspose");
+ // Nothing to configure
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported weight transform method");
+ }
+ }
+ }
+ else
+ {
+ ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("No weight transformation is performed");
}
-
- // Run weights reshaping and mark original weights tensor as unused
- CpuAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
- auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1);
- ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}};
- NEScheduler::get().schedule_op(_weights_reshape_kernel.get(), 3, _weights_reshape_kernel->window(), pack);
- weights->mark_as_unused();
ITensorPack gemm_pack = tensors;
- gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get());
+ // Allocate reshaped weights if required
+ CpuAuxTensorHandler reinterpreted_wei(
+ _weights_reshaped,
+ *weights); // Re-interpreted weights. Only tensor shape is changed. No allocation
+ CpuAuxTensorHandler reshaped_wei(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors);
+ // Run weights reshape if required
+ if (_run_wt)
+ {
+ switch (_wt_method)
+ {
+ case (WeightTransformMethod::FusedReshapeAndTranspose):
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}};
+ NEScheduler::get().schedule_op(_weights_reshape_and_transpose_kernel.get(), Window::DimW,
+ _weights_reshape_and_transpose_kernel->window(), pack);
+ weights->mark_as_unused();
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ break;
+ }
+ case (WeightTransformMethod::ReshapeThenTranspose):
+ {
+ ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, reshaped_wei.get()}};
+ _weights_reshape->run(pack);
+ weights->mark_as_unused();
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reshaped_wei.get());
+ break;
+ }
+ case (WeightTransformMethod::ReinterpretThenTranspose):
+ {
+ gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, reinterpreted_wei.get());
+ // Nothing to run
+ break;
+ }
+ default:
+ {
+ ARM_COMPUTE_ERROR("Unsupported weight transform method");
+ }
+ }
+ }
_is_quantized ? _mm_gemmlowp->prepare(gemm_pack) : _mm_gemm->prepare(gemm_pack);
+
_is_prepared = true;
}
}
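
The weight transformation note added above reduces to simple shape bookkeeping: patch_size = k_h * k_w * in_c, lhs = [num_patch, patch_size] after im2col, and rhs = [patch_size, out_c] after collapsing and transposing the weights. Below is a minimal standalone sketch of that bookkeeping for NHWC with unit stride and no padding (the sizes are assumed for illustration only; this is not ACL API code):

    #include <cstdio>

    int main()
    {
        // Assumed example sizes: src = [batch, in_h, in_w, in_c], weight = [out_c, k_h, k_w, in_c]
        const unsigned int batch = 1, in_h = 8, in_w = 8, in_c = 3;
        const unsigned int out_c = 16, k_h = 3, k_w = 3;
        const unsigned int out_h = in_h - k_h + 1, out_w = in_w - k_w + 1; // unit stride, no padding

        // im2col: each output element gets one linearized "patch" covering its receptive field
        const unsigned int patch_size = k_h * k_w * in_c;      // 27
        const unsigned int num_patch  = batch * out_h * out_w; // 36

        // Weight transform: [out_c, k_h, k_w, in_c] -> collapse -> [out_c, patch_size] -> transpose -> [patch_size, out_c]
        std::printf("lhs = [%u x %u], rhs = [%u x %u], gemm_dst = [%u x %u]\n", num_patch, patch_size, patch_size,
                    out_c, num_patch, out_c);
        return 0;
    }
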
diff --git a/src/cpu/operators/CpuGemmConv2d.h b/src/cpu/operators/CpuGemmConv2d.h
index 118d366517..48a0d11107 100644
--- a/src/cpu/operators/CpuGemmConv2d.h
+++ b/src/cpu/operators/CpuGemmConv2d.h
@@ -42,21 +42,12 @@ class CpuGemmLowpOutputStage;
class CpuReshape;
namespace kernels
{
-class CpuWeightsReshapeKernel;
class CpuIm2ColKernel;
class CpuCol2ImKernel;
+class CpuWeightsReshapeKernel;
} // namespace kernels
-/** Basic function to compute the convolution layer. This function calls the following kernels/functions:
- *
- * -# @ref cpu::kernels::CpuIm2ColKernel
- * -# @ref CpuGemm (if the data type is BFLOAT16/FP16/FP32)
- * -# @ref CpuGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref CpuGemmLowpOutputStage (if the data type is QASYMM8/QASYMM8_SIGNED)
- * -# @ref cpu::kernels::CpuCol2ImKernel (if NCHW data layout)
- * -# @ref kernels::CpuWeightsReshapeKernel
- *
- */
+/** Basic function to compute the convolution layer. @ref note_CpuGemmConv2d_weight_transformation */
class CpuGemmConv2d : public ICpuOperator
{
public:
@@ -99,7 +90,7 @@ public:
* @param[out] dst Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
* Data types supported: Same as @p input.
* @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
- * @param[in] weights_info Specifies if the weights tensor has been reshaped with NEWeightsReshapeKernel. If this is not part of the fully connected layer the weights
+ * @param[in] weights_info Specifies if the weights tensor has been reshaped with CpuWeightsReshapeKernel. If this is not part of the fully connected layer the weights
* tensor has also been transposed with cpu::kernels::CpuGemmTranspose1xWKernel. Data type supported: Same as @p input.
* @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1).
* @param[in] act_info (Optional) Activation layer information in case of a fused activation. Only RELU, BOUNDED_RELU and LU_BOUNDED_RELU supported.
@@ -136,7 +127,7 @@ public:
/** Indicates whether or not there is an optimal assembly implementation that can be used to process the given parameters.
*
- * The paramter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl
+ * The parameter list is the same as @ref NEGEMMConvolutionLayer::has_opt_impl
*
* @return a status.
*/
@@ -254,15 +245,35 @@ private:
bool isVarWeightsKernel() const;
enum AuxTensorIdx
{
- // CpuGemmLowpMatrixMultiplyCore has up to 8 internal tensors
- Im2ColOutput = 9,
+ GemmAsmPretransposedRHS = 2, // CpuGemmAssemblyDispatch::Pretranspose
+ GemmTransposed1xWRHS = 5, // CpuGemm::Transposed1xWRHS
+ GemmLowpTransposed1xWRHS = 6, // CpuGemmLowpMatrixMultiplyCore::TmpB
+ /* Slots 0 - 9 reserved and shared by CpuGemmLowpMatrixMultiplyCore and CpuGemm */
+ Im2ColOutput = 10,
WeightsReshaped,
GemmOutput,
Count
};
- std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_kernel;
- std::unique_ptr<cpu::kernels::CpuIm2ColKernel> _im2col_kernel;
+ /** Weight transformation method. See @ref note_CpuGemmConv2d_weight_transformation */
+ enum class WeightTransformMethod
+ {
+ ReinterpretThenTranspose,
+ ReshapeThenTranspose,
+ FusedReshapeAndTranspose,
+ };
+
+ /** Select weight transformation method
+ *
+ * @param[in] weights Input weights
+ *
+ * @return WeightTransformMethod
+ */
+ static WeightTransformMethod get_wt_method(const ITensorInfo &weights);
+
+ std::unique_ptr<CpuReshape> _weights_reshape;
+ std::unique_ptr<kernels::CpuWeightsReshapeKernel> _weights_reshape_and_transpose_kernel;
+ std::unique_ptr<kernels::CpuIm2ColKernel> _im2col_kernel;
std::unique_ptr<CpuGemm> _mm_gemm;
std::unique_ptr<CpuGemmLowpMatrixMultiplyCore> _mm_gemmlowp;
std::unique_ptr<kernels::CpuCol2ImKernel> _col2im_kernel;
@@ -275,10 +286,12 @@ private:
DataLayout _data_layout;
- bool _skip_im2col;
- bool _skip_col2im;
- bool _is_quantized;
- bool _is_prepared;
+ bool _skip_im2col;
+ bool _skip_col2im;
+ bool _is_quantized;
+ bool _is_prepared;
+ WeightTransformMethod _wt_method;
+ bool _run_wt;
experimental::MemoryRequirements _aux_mem{Count};
};
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.cpp b/src/cpu/operators/CpuGemmDirectConv2d.cpp
index 8fa81b1907..9187927541 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.cpp
+++ b/src/cpu/operators/CpuGemmDirectConv2d.cpp
@@ -140,9 +140,11 @@ void CpuGemmDirectConv2d::configure(const ITensorInfo *src,
}
// Add auxiliary memory requirements of the assembly dispatch
- auto asm_mem_req = _gemm_asm_func->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
+ const auto asm_mem_req = _gemm_asm_func->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
if (_aux_mem[Pretranspose].size > 0)
{
diff --git a/src/cpu/operators/CpuGemmDirectConv2d.h b/src/cpu/operators/CpuGemmDirectConv2d.h
index 1cc3caadae..a7365615b9 100644
--- a/src/cpu/operators/CpuGemmDirectConv2d.h
+++ b/src/cpu/operators/CpuGemmDirectConv2d.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
-#define ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H
#include "arm_compute/core/TensorInfo.h"
@@ -95,8 +95,10 @@ public:
private:
enum AuxTensorIdx
{
- AsmGemmWorkspace = 0,
+ GemmTemp0 = 0,
+ GemmTemp1,
Pretranspose,
+ /* Slots above (0-2) are reserved for CpuGemmAssemblyDispatch */
PermutedWeights,
Count
};
@@ -112,4 +114,4 @@ private:
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_GEMM_DIRECT_CONV_2D_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMDIRECTCONV2D_H
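
A pattern repeated across these operators: instead of copying two hard-coded workspace entries from the nested CpuGemmAssemblyDispatch, the parent now forwards every entry the nested operator reports and starts its own AuxTensorIdx values after the reserved range. A standalone sketch of the idea follows (types, slot counts and sizes are illustrative, not the ACL API):

    #include <cstddef>
    #include <cstdio>
    #include <vector>

    struct MemoryInfo
    {
        int         slot{-1};
        std::size_t size{0};
    };

    int main()
    {
        // Pretend the nested assembly dispatch reports three workspace entries (slots 0-2)
        const std::vector<MemoryInfo> asm_mem_req = {{0, 1024}, {1, 0}, {2, 4096}};

        // The parent reserves slots 0-2 for the nested operator and appends its own tensors after them
        enum AuxTensorIdx { /* 0-2 reserved */ PermutedWeights = 3, Count };
        std::vector<MemoryInfo> aux_mem(Count);

        for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
        {
            aux_mem[slot] = asm_mem_req[slot]; // forward every nested slot, not just a known subset
        }
        aux_mem[PermutedWeights] = {PermutedWeights, 512};

        for (const auto &m : aux_mem)
        {
            std::printf("slot %d: %zu bytes\n", m.slot, m.size);
        }
        return 0;
    }
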
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
index 2ee879b67b..b25505a85d 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.cpp
@@ -297,9 +297,11 @@ void CpuGemmLowpMatrixMultiplyCore::configure(
if (_assembly_path)
{
- auto asm_mem_req = _asm_glue->workspace();
- _aux_mem[AsmGemmWorkspace] = asm_mem_req[AsmGemmWorkspace];
- _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
+ const auto asm_mem_req = _asm_glue->workspace();
+ for (unsigned int slot = 0; slot < asm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = asm_mem_req[slot];
+ }
}
// Request memory for LHS and RHS reshape matrix
diff --git a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
index a7798938e7..78065a8953 100644
--- a/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
+++ b/src/cpu/operators/CpuGemmLowpMatrixMultiplyCore.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
-#define ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H
+#ifndef ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
+#define ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/function_info/GEMMInfo.h"
@@ -134,9 +134,8 @@ public:
private:
enum AuxTensorIdx
{
- AsmGemmWorkspace = 0,
- Pretranspose,
- VectorSumCol,
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ VectorSumCol = 3,
VectorSumRow,
TmpA,
TmpB,
@@ -181,4 +180,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /*ARM_COMPUTE_CPU_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
+#endif // ACL_SRC_CPU_OPERATORS_CPUGEMMLOWPMATRIXMULTIPLYCORE_H
diff --git a/src/cpu/operators/CpuMatMul.h b/src/cpu/operators/CpuMatMul.h
index 24db3da346..2b1b4cf0ff 100644
--- a/src/cpu/operators/CpuMatMul.h
+++ b/src/cpu/operators/CpuMatMul.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL
-#define ACL_SRC_CPU_OPERATORS_CPUMATMUL
+#ifndef ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
+#define ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
#include "arm_compute/core/TensorInfo.h"
@@ -93,9 +93,8 @@ public:
private:
enum InternalTensorIdx
{
- AsmGemmWorkspace = 0, // Pre-allocate workspace tensors for CpuGemmAssemblyDispatch
- PretransposeRHS, // Pre-allocate workspace tensors for CpuGemmAssemblyDispatch
- TransposeLHS,
+ /* Slots 0 - 2 reserved for CpuGemmAssemblyDispatch */
+ TransposeLHS = 3,
TransposeRHS,
Count
};
@@ -124,4 +123,4 @@ private:
} // namespace cpu
} // namespace arm_compute
-#endif /* ACL_SRC_CPU_OPERATORS_CPUMATMUL */
+#endif // ACL_SRC_CPU_OPERATORS_CPUMATMUL_H
diff --git a/src/cpu/operators/CpuWinogradConv2d.cpp b/src/cpu/operators/CpuWinogradConv2d.cpp
index 9d07736c13..e4bcdc0b64 100644
--- a/src/cpu/operators/CpuWinogradConv2d.cpp
+++ b/src/cpu/operators/CpuWinogradConv2d.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021-2022 Arm Limited.
+ * Copyright (c) 2021-2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -294,12 +294,11 @@ void CpuWinogradConv2d::configure(const ITensorInfo *src,
_activation_func->configure(dst, nullptr, act_info);
}
- auto asm_mem_req = _gemm_function->workspace();
- _aux_mem[GemmWorkspace] = asm_mem_req[GemmWorkspace];
- _aux_mem[Pretranspose] = asm_mem_req[Pretranspose];
- _aux_mem[InterleavedLHS] = asm_mem_req[InterleavedLHS];
- _aux_mem[TransposedRHS] = asm_mem_req[TransposedRHS];
- _aux_mem[TempResult] = asm_mem_req[TempResult];
+ const auto mm_mem_req = _gemm_function->workspace();
+ for (unsigned int slot = 0; slot < mm_mem_req.size(); ++slot)
+ {
+ _aux_mem[slot] = mm_mem_req[slot];
+ }
// Request temporary memory. Overlap memory needed for Input/Output transformations as they run on different non-overlapping time-steps.
_aux_mem[TransformedInput] = MemoryInfo(offset_int_vec(TransformedInput), MemoryLifetime::Temporary,
diff --git a/src/cpu/operators/CpuWinogradConv2d.h b/src/cpu/operators/CpuWinogradConv2d.h
index ba9b879431..03bfc51a46 100644
--- a/src/cpu/operators/CpuWinogradConv2d.h
+++ b/src/cpu/operators/CpuWinogradConv2d.h
@@ -29,8 +29,8 @@
#include "src/core/common/Macros.h"
#include "src/cpu/ICpuOperator.h"
-#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/kernels/assembly/gemm_common.hpp"
+#include "src/cpu/kernels/CpuWinogradConv2dKernel.h"
#include "src/cpu/operators/CpuActivation.h"
#include "src/cpu/operators/CpuGemm.h"
#include "src/cpu/operators/CpuPermute.h"
@@ -96,26 +96,22 @@ public:
bool enable_fast_math = false);
// Inherited methods overridden:
- void run(ITensorPack &tensors) override;
- void prepare(ITensorPack &constants) override;
+ void run(ITensorPack &tensors) override;
+ void prepare(ITensorPack &constants) override;
experimental::MemoryRequirements workspace() const override;
private:
enum AuxTensorIdx
{
- GemmWorkspace = 0,
- Pretranspose = 1,
- InterleavedLHS = 2,
- TransposedRHS = 3,
- TempResult = 4,
- TransformedInput = 5,
- TransformedOutput = 6,
- WorkspaceIO = 7,
- TransformedWeights = 8,
- PermutedWeights = 9,
- PermutedInput = TransformedOutput,
- PermutedOutput = TransformedInput,
- Count = 10
+        /** Slots 0 - 6 reserved for CpuGemm */
+ TransformedInput = 7,
+ TransformedOutput,
+ WorkspaceIO,
+ TransformedWeights,
+ PermutedWeights,
+ Count,
+ PermutedInput = TransformedOutput,
+ PermutedOutput = TransformedInput
};
std::unique_ptr<CpuGemm> _gemm_function;
std::unique_ptr<CpuActivation> _activation_func;
@@ -124,9 +120,9 @@ private:
std::unique_ptr<CpuPermute> _permute_input;
std::unique_ptr<CpuPermute> _permute_output;
std::unique_ptr<CpuPermute> _permute_weights;
- experimental::MemoryRequirements _aux_mem{ Count };
+ experimental::MemoryRequirements _aux_mem{Count};
std::unique_ptr<arm_conv::ConvolutionArgs>
- _conv_args; // Make it unique ptr because this type does not have a default constructor
+ _conv_args; // Make it unique ptr because this type does not have a default constructor
arm_conv::winograd::WinogradImpl _winograd_impl;
DataLayout _data_layout;
TensorInfo _winograd_transformed_input;
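The rewritten enum also keeps the aliasing trick: PermutedInput and PermutedOutput resolve to the transform-buffer slots, so the two phases share one allocation because they run on non-overlapping time-steps (as noted in the .cpp hunk above). A small illustrative sketch of the aliasing, with a check that the aliases collapse to the same slot indices:

    // Illustrative only: two names, one auxiliary slot, mirroring AuxTensorIdx above.
    enum WorkspaceSlot
    {
        /* slots 0 - 6 reserved for CpuGemm */
        TransformedInput = 7,
        TransformedOutput,
        PermutedInput  = TransformedOutput, // reuses the output-transform buffer
        PermutedOutput = TransformedInput   // reuses the input-transform buffer
    };

    static_assert(PermutedInput == TransformedOutput && PermutedOutput == TransformedInput,
                  "aliased names must map to the same auxiliary slots");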
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
index 343ef21c0b..82bd465c99 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.cpp
@@ -31,6 +31,7 @@
#include "src/core/utils/AssemblyUtils.h"
#include "src/cpu/kernels/assembly/arm_gemm.hpp"
#include "src/cpu/kernels/assembly/CpuGemmAssemblyWrapperKernel.h"
+#include "src/cpu/operators/CpuTranspose.h"
#include "src/cpu/utils/CpuAuxTensorHandler.h"
#include <arm_neon.h>
@@ -229,6 +230,7 @@ private:
enum AuxTensorIdx
{
AsmGemmWorkspace = 0,
+ PrePretransposedB, /* Transposed B (rhs) before being passed to gemm or pretranspose_B_array */
Pretranspose,
Count
};
@@ -244,12 +246,16 @@ private:
/** Prepare the indirect buffer */
void prepare_indirect_buffer(ITensorPack &tensors);
+ /** Operator to transpose B before gemm or pretranspose_B_array */
+ std::unique_ptr<CpuTranspose> _pre_pretranspose_b{nullptr};
/** Assembly Gemm kernel */
std::shared_ptr<arm_gemm::GemmCommon<TypeInput, TypeOutput>> _gemm_kernel_asm{nullptr};
/** Optimised Arm® Neon™ kernel */
std::unique_ptr<INEKernel> _optimised_kernel{nullptr};
/** Assembly GEMM workspace tensor info */
TensorInfo _workspace_info{};
+ /** Pre-pre-transposed B tensor info */
+ TensorInfo _pre_pretransposed_b_info{};
/** Pre-transpose tensor info */
TensorInfo _pretranspose_info{};
/** Prepared flag */
@@ -473,9 +479,45 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensorInfo *
_optimised_kernel = std::move(acl_gemm_wrapper);
_gemm_info = gemm_info;
+ // Check if we need to pre-pretranspose B. Fixed format kernels need no pre-pretranspose.
+ const bool run_pre_pretranspose_b = _gemm_info.transpose_b && !isVarWeightsKernel();
+ if (run_pre_pretranspose_b)
+ {
+ _pre_pretranspose_b = std::make_unique<CpuTranspose>();
+ _pre_pretranspose_b->configure(b, &_pre_pretransposed_b_info);
+ MemoryLifetime lifetime;
+ if (_is_b_constant)
+ {
+ if (_gemm_kernel_asm->B_pretranspose_required())
+ {
+ // PrePretransposedB tensor is only used in prepare(), but is then succeeded by Pretranspose
+ // So PrePretransposedB can be freed inside prepare()
+ lifetime = MemoryLifetime::Prepare;
+ }
+ else
+ {
+ // PrePretransposedB tensor is only used in prepare(), but is the final transformation of B
+ // So PrePretransposedB needs to persist beyond prepare()
+ lifetime = MemoryLifetime::Persistent;
+ }
+ }
+ else
+ {
+ // PrePretransposedB tensor is always used in run() and doesn't need to persist
+ lifetime = MemoryLifetime::Temporary;
+ }
+ // Forcing 128-byte alignment (required by 32-bit kernels)
+ const unsigned int alignment = 128;
+ _aux_mem[PrePretransposedB] =
+ MemoryInfo(offset_int_vec(PrePretransposedB), lifetime, _pre_pretransposed_b_info.total_size(), alignment);
+ }
+
// Check for pre-transposed support
if (_gemm_kernel_asm->B_pretranspose_required())
{
+ // Fixed format kernels need no pretranspose.
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
// Forcing 128-byte alignment (required by 32-bit kernels)
const unsigned int alignment = 128;
const size_t B_pretranspose_size = _gemm_kernel_asm->get_B_pretransposed_array_size();
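The lifetime logic added in this hunk boils down to two inputs: whether B is constant and whether the assembly kernel will pretranspose it again. A condensed sketch of that decision; the header path for the experimental types is an assumption, and the function name is illustrative only.

    #include "arm_compute/core/experimental/Types.h" // MemoryLifetime (assumed location)

    using arm_compute::experimental::MemoryLifetime;

    // Condensed form of the branch above: non-constant B is recomputed on every run(); constant B
    // is computed once in prepare() and is either superseded by the Pretranspose slot or kept as
    // the final form of B.
    MemoryLifetime pre_pretransposed_b_lifetime(bool is_b_constant, bool b_pretranspose_required)
    {
        if (!is_b_constant)
        {
            return MemoryLifetime::Temporary; // used on every run(), no need to persist
        }
        return b_pretranspose_required ? MemoryLifetime::Prepare     // freed inside prepare(), Pretranspose takes over
                                       : MemoryLifetime::Persistent; // final transformation of B, outlives prepare()
    }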
@@ -506,6 +548,22 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
_gemm_kernel_asm->set_quantized_bias(
reinterpret_cast<const int32_t *>(c->buffer() + c->info()->offset_first_element_in_bytes()), 0);
}
+ const ITensor *b_to_use = b;
+ // Pre-pretranspose B if required
+ const bool run_pre_pretranspose_b = _gemm_info.transpose_b && !isVarWeightsKernel();
+ CpuAuxTensorHandler pre_pretransposed_b(
+ offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors,
+ /*pack_inject: no need to inject into tensors*/
+ false,
+ /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/
+ !run_pre_pretranspose_b);
+ if (run_pre_pretranspose_b)
+ {
+ ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr);
+ ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}};
+ _pre_pretranspose_b->run(pre_pretranspose_pack);
+ b_to_use = pre_pretransposed_b.get();
+ }
// Pretranspose B if required
if (_gemm_kernel_asm->B_pretranspose_required())
@@ -513,10 +571,10 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
// Fixed format kernels need no pretranspose.
ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto in1_ptr =
- reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+ const auto in1_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
+ b_to_use->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, false);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
@@ -525,6 +583,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::prepare(ITensorPack &tensors)
NEScheduler::get().num_threads());
b->mark_as_unused();
+ // Note: b_to_use does not need to be marked as unused; if it points at pre_pretransposed_b, its memory is managed automatically by the aux tensor handler
}
if (_gemm_info.method == AsmConvMethod::Indirect)
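Both prepare() and run() now derive the leading dimension and multi-stride from whichever tensor actually feeds the assembly kernel (b_to_use), because the auxiliary transposed tensor has different strides than the original B. A minimal sketch of those two computations, using only the ITensorInfo accessors that appear in the hunks; the helper names are illustrative.

    #include "arm_compute/core/ITensorInfo.h"

    // Leading dimension (in elements) of the matrix fed to the assembly kernel.
    inline int leading_dim_elements(const arm_compute::ITensorInfo &info)
    {
        return info.strides_in_bytes().y() / info.element_size();
    }

    // Stride (in elements) between consecutive multis/batches of the same matrix.
    inline int multi_stride_elements(const arm_compute::ITensorInfo &info)
    {
        return info.strides_in_bytes().z() / info.element_size();
    }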
@@ -576,16 +635,33 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
const TypeInput *in1_ptr = nullptr;
auto out_ptr = reinterpret_cast<TypeOutput *>(d->buffer() + d->info()->offset_first_element_in_bytes());
+ const ITensor *b_to_use = b;
+
+ // Pre-pretranspose B if required
+ const bool run_pre_pretranspose_b = _gemm_info.transpose_b && !isVarWeightsKernel();
+ CpuAuxTensorHandler pre_pretransposed_b(
+ offset_int_vec(PrePretransposedB), _pre_pretransposed_b_info, tensors,
+ false /*pack_inject: no need to inject into tensors*/,
+ !run_pre_pretranspose_b /*bypass_alloc: no need to allocate if pre-pretranspose B is not required as this handle will not be used*/);
+ if (b_to_use && !_is_b_constant && run_pre_pretranspose_b)
+ {
+ ARM_COMPUTE_ERROR_ON(_pre_pretranspose_b == nullptr);
+ ITensorPack pre_pretranspose_pack{{ACL_SRC, b_to_use}, {ACL_DST, pre_pretransposed_b.get()}};
+ _pre_pretranspose_b->run(pre_pretranspose_pack);
+ b_to_use = pre_pretransposed_b.get();
+ }
+
// Check if B is pre-transposed and de-reference if not
if (!_gemm_kernel_asm->B_is_pretransposed())
{
- ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
- in1_ptr = reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
+ ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+ multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
+ in1_ptr =
+ reinterpret_cast<const TypeInput *>(b_to_use->buffer() + b_to_use->info()->offset_first_element_in_bytes());
}
// If necessary, run pretranspose every time if either weights or biases are non-constant
- if ((b && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
+ if ((b_to_use && !_is_b_constant) || (c && !_is_c_constant && c->info()->data_type() == DataType::S32))
{
if (c && c->info()->data_type() == DataType::S32)
{
@@ -596,10 +672,13 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run(ITensorPack &tensors)
// Pretranspose B if required
if (_B_pretranspose_required)
{
- const int ldb = b->info()->strides_in_bytes().y() / b->info()->element_size();
- const auto b_ptr =
- reinterpret_cast<const TypeInput *>(b->buffer() + b->info()->offset_first_element_in_bytes());
- const int multi_stride_b = b->info()->strides_in_bytes().z() / b->info()->element_size();
+ // Fixed format kernels need no pretranspose.
+ ARM_COMPUTE_ERROR_ON(arm_compute::is_fixed_format(
+ assembly_utils::map_to_arm_compute_weight_format(_gemm_kernel_asm->get_config().weight_format)));
+ const int ldb = b_to_use->info()->strides_in_bytes().y() / b_to_use->info()->element_size();
+ const auto b_ptr = reinterpret_cast<const TypeInput *>(b_to_use->buffer() +
+ b_to_use->info()->offset_first_element_in_bytes());
+ const int multi_stride_b = b_to_use->info()->strides_in_bytes().z() / b_to_use->info()->element_size();
CpuAuxTensorHandler pretranspose(offset_int_vec(Pretranspose), _pretranspose_info, tensors, true);
ARM_COMPUTE_ERROR_ON(pretranspose.get()->buffer() == nullptr);
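In run() the extra transpose is re-executed only for non-constant B; constant B was already transformed in prepare(), where the Prepare-lifetime buffer may since have been released. A condensed sketch of that guard, using the same flags the operator consults (the function name is illustrative):

    // Mirrors the condition guarding the CpuTranspose call in run(): the extra transpose is
    // requested at all only when transpose_b is set and the kernel is not fixed-format, and it
    // is repeated per run() only when B can change between runs.
    bool run_pre_pretranspose_in_run(bool is_b_constant, bool transpose_b, bool is_var_weights_kernel)
    {
        const bool run_pre_pretranspose_b = transpose_b && !is_var_weights_kernel;
        return run_pre_pretranspose_b && !is_b_constant;
    }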
@@ -762,6 +841,7 @@ Status CpuGemmAssemblyDispatch::has_opt_impl(arm_compute::WeightFormat &expected
arm_gemm::WeightFormat arm_gemm_expected_wf = assembly_utils::map_to_arm_gemm_weight_format(expected_weight_format);
arm_gemm::GemmArgs args(&ci, p.M, p.N, p.K, p.sections, p.batches, p.multis, p.indirect, act, num_threads,
info.fixed_format, info.fast_mode, &cfg);
+ // TODO: Incorporate info.transpose_b COMPMID-6595
switch (a->data_type())
{
case DataType::F32:
diff --git a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
index 5be39a54c0..671a222fed 100644
--- a/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
+++ b/src/cpu/operators/internal/CpuGemmAssemblyDispatch.h
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
-#define ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H
+#ifndef ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H
+#define ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H
#include "arm_compute/function_info/ActivationLayerInfo.h"
@@ -57,6 +57,13 @@ struct AsmGemmInfo
bool fixed_format{false};
arm_compute::WeightFormat weight_format{arm_compute::WeightFormat::UNSPECIFIED};
bool reshape_b_only_on_first_run{true};
+ /** Whether we want to perform an additional transpose of b before passing it to gemm or pretranspose_B_array
+ * @note This transpose of b is also considered a form of "reshape" or "transform", so it is accounted for
+ * by the reshape_b_only_on_first_run flag
+ * @note This flag is silently ignored (assumed to be false) when the weight_format is a fixed format, because
+ * fixed format kernels do not accept weights (B) with any prior transformations
+ */
+ bool transpose_b{false};
};
/** Assembly kernel glue */
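As documented on the new field, callers opt in per GEMM by setting transpose_b on the AsmGemmInfo passed to the dispatch. A hedged usage sketch: only the fields visible in this hunk are touched, everything else keeps its defaults, and the function name is illustrative.

    #include "src/cpu/operators/internal/CpuGemmAssemblyDispatch.h"

    // Request the extra transpose of B before gemm / pretranspose_B_array; illustration only.
    arm_compute::cpu::AsmGemmInfo make_transposed_b_gemm_info()
    {
        arm_compute::cpu::AsmGemmInfo info{};
        info.transpose_b                 = true; // transpose B before any further reshaping
        info.reshape_b_only_on_first_run = true; // the transpose counts as a reshape, done once for constant B
        return info;
    }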
@@ -187,4 +194,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_INTERNAL_CPU_GEMM_ASSEMBLY_DISPATCH_H */
+#endif // ACL_SRC_CPU_OPERATORS_INTERNAL_CPUGEMMASSEMBLYDISPATCH_H
diff --git a/src/cpu/utils/CpuAuxTensorHandler.h b/src/cpu/utils/CpuAuxTensorHandler.h
index e23b88a777..627216837b 100644
--- a/src/cpu/utils/CpuAuxTensorHandler.h
+++ b/src/cpu/utils/CpuAuxTensorHandler.h
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2021 Arm Limited.
+ * Copyright (c) 2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -21,8 +21,8 @@
* OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
* SOFTWARE.
*/
-#ifndef ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
-#define ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H
+#ifndef ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H
+#define ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
@@ -71,7 +71,13 @@ public:
}
}
- CpuAuxTensorHandler(TensorInfo &info, ITensor &tensor) : _tensor()
+ /** Create a temporary handle to the original tensor with a new @ref TensorInfo
+ * This is useful when we want to change a tensor's associated @ref TensorInfo at run time without modifying the original tensor
+ *
+ * @param[in] info New tensor info to "assign" to @p tensor
+ * @param[in] tensor Tensor to be assigned a new @ref TensorInfo
+ */
+ CpuAuxTensorHandler(TensorInfo &info, const ITensor &tensor) : _tensor()
{
_tensor.allocator()->soft_init(info);
if (info.total_size() <= tensor.info()->total_size())
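A hedged usage sketch of the new constructor: viewing an existing tensor through a different TensorInfo without touching the tensor's own metadata. It assumes the handler binds to the original buffer when the new info is no larger than the original (the rest of the constructor body falls outside this hunk), and the transposed-shape example and function name are purely illustrative.

    #include "arm_compute/core/ITensor.h"
    #include "arm_compute/core/TensorInfo.h"
    #include "arm_compute/core/TensorShape.h"
    #include "src/cpu/utils/CpuAuxTensorHandler.h"

    using namespace arm_compute;

    // View a 2D tensor as its transpose-shaped counterpart for the duration of one call,
    // without modifying the original tensor's TensorInfo.
    void use_as_transposed_view(const ITensor &tensor)
    {
        TensorInfo transposed_info(TensorShape(tensor.info()->dimension(1), tensor.info()->dimension(0)),
                                   1 /* num_channels */, tensor.info()->data_type());
        cpu::CpuAuxTensorHandler view(transposed_info, tensor);
        auto *reinterpreted = view.get(); // same storage as `tensor`, different shape/strides
        (void)reinterpreted;
    }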
@@ -108,4 +114,4 @@ private:
};
} // namespace cpu
} // namespace arm_compute
-#endif /* ARM_COMPUTE_CPU_UTILS_CPU_AUX_TENSOR_HANDLER_H */
+#endif // ACL_SRC_CPU_UTILS_CPUAUXTENSORHANDLER_H