20 files changed, 1518 insertions, 101 deletions
diff --git a/arm_compute/core/CPP/ICPPKernel.h b/arm_compute/core/CPP/ICPPKernel.h
index f41567ee11..ec05af20bd 100644
--- a/arm_compute/core/CPP/ICPPKernel.h
+++ b/arm_compute/core/CPP/ICPPKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -49,7 +49,25 @@ public:
      * @param[in] window Region on which to execute the kernel. (Must be a region of the window returned by window())
      * @param[in] info   Info about executing thread and CPU.
      */
-    virtual void run(const Window &window, const ThreadInfo &info) = 0;
+    virtual void run(const Window &window, const ThreadInfo &info)
+    { // Default body (run() used to be pure virtual): it performs no work and only raises an error.
+        ARM_COMPUTE_UNUSED(window);
+        ARM_COMPUTE_UNUSED(info);
+        ARM_COMPUTE_ERROR("default implementation of legacy run() virtual member function invoked");
+    }
+
+    /** Legacy compatibility layer for implementations which do not support thread_locator.
+     * In these cases we simply narrow the interface down to the legacy version, i.e. run().
+     *
+     * @param[in] window         Region on which to execute the kernel. (Must be a region of the window returned by window())
+     * @param[in] info           Info about executing thread and CPU.
+     * @param[in] thread_locator Specifies "where" the current thread is in the multi-dimensional space
+     */
+    virtual void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator)
+    {
+        ARM_COMPUTE_UNUSED(thread_locator); // dropped: the legacy run() has no parameter for it
+        run(window, info);
+    }
 
     /** Name of the kernel
      *
diff --git a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
index d612681c41..0e3dd74577 100644
--- a/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
+++ b/arm_compute/core/NEON/kernels/assembly/NEGEMMAssemblyWrapperKernel.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -24,6 +24,7 @@
 #ifndef ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
 #define ARM_COMPUTE_ASSEMBLY_GEMM_KERNEL_WRAPPER_KERNEL_H
 
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
 #include "arm_compute/core/NEON/INEKernel.h"
 #include "arm_compute/core/Utils.h"
 #include "arm_compute/core/Validate.h"
@@ -65,15 +66,33 @@ public:
     {
         return _name.c_str();
     }
-    // Inherited methods overridden:
+
+
     void run(const Window &window, const ThreadInfo &info) override
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
         ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
-        auto first = window.x().start();
-        auto last  = window.x().end();
-        _kernel->execute(first, last, info.thread_id);
+
+        auto win=arm_gemm::to_ndcoord(window); // convert the arm_compute window into an arm_gemm coordinate
+
+        arm_gemm::ndcoord_t thread_locator { }; // legacy entry point receives no thread position: pass an empty-initialised locator
+
+        _kernel->execute(win, thread_locator, info.thread_id);
     }
+
+    // Inherited methods overridden: N-dimensional entry point, forwards the work window and the thread locator to the assembly kernel
+    void run_nd(const Window &window, const ThreadInfo &info, const Window &thread_locator) override
+    {
+        ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(_kernel)));
+        ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+
+        // Convert both arm_compute windows into their arm_gemm coordinate equivalents
+        auto ndc_win = arm_gemm::to_ndcoord(window);
+        auto ndc_tlc = arm_gemm::to_ndcoord(thread_locator);
+
+        _kernel->execute(ndc_win, ndc_tlc, info.thread_id);
+    }
+
     /** Initialise the kernel's input and output.
      *
      * @param[in] kernel      Pointer to an assembly kernel implementation.
@@ -83,9 +102,9 @@ public:
     {
         ARM_COMPUTE_ERROR_ON_NULLPTR((reinterpret_cast<void *>(kernel)));
         _kernel         = kernel;
-        auto   win_last = _kernel->get_window_size();
-        Window win;
-        win.set(Window::DimX, Window::Dimension(0, win_last, 1));
+
+        Window win = to_window(kernel->get_window_size());
+
         INEKernel::configure(win);
 
         if(!kernel_name_tag.empty())
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
index e89523981d..7723224ec8 100644
--- a/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2018-2019 ARM Limited.
+ * Copyright (c) 2018-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -40,6 +40,7 @@ enum class GemmMethod
     GEMM_NATIVE,
     GEMM_HYBRID,
     GEMM_INTERLEAVED,
+    GEMM_INTERLEAVED_2D,
     QUANTIZE_WRAPPER,
     GEMM_HYBRID_QUANTIZED
 };
diff --git a/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
new file mode 100644
index 0000000000..7dff01003d
--- /dev/null
+++ b/arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_compute/core/Window.h"
+#include "arm_compute/core/Dimensions.h"
+#include "src/core/NEON/kernels/arm_gemm/ndrange.hpp"
+
+#include <cassert>
+
+/* This file contains mapping between integral types used in arm_compute and arm_gemm
+ * These two codebases both require a degree of separation for the sake of modularity
+ * so maintain their own types which represent similar information.
+ */
+
+namespace arm_gemm {
+
+//we want to unify the maximum number of dimensions used between arm_gemm and arm compute library
+constexpr std::size_t ndrange_max =
+    arm_compute::Dimensions<unsigned int>::num_max_dimensions;
+
+using ndrange_t=NDRange<ndrange_max>;
+using ndcoord_t=NDCoordinate<ndrange_max>;
+
+/** Converts an `arm_gemm::ndrange_t` to an `arm_compute::Window`
+ *
+ * As `NDRange<T>` does not encode start positions, we specify
+ * the start to be zero in the produced `arm_compute::Window`
+ *
+ * @param [ndr] the `arm_gemm::ndrange_t` we wish to convert into an `arm_compute::Window`
+ * @returns an `arm_compute::Window` whose dimension i spans [0, ndr.get_size(i))
+ */
+inline arm_compute::Window to_window(const ndrange_t& ndr) {
+    arm_compute::Window win;
+
+    for(unsigned int i = 0; i!=ndrange_max; ++i) {
+        //populate the window with the size of each NDRange dimension, starting at zero
+        win.set(i, arm_compute::Window::Dimension(0, ndr.get_size(i)));
+    }
+
+    return win;
+}
+
+/**
+ * Converts an `arm_gemm::ndcoord_t` to an `arm_compute::Window`
+ *
+ * @param [ndc] the `arm_gemm::ndcoord_t` we wish to convert into an `arm_compute::Window`
+ * @returns an `arm_compute::Window` whose dimension i spans [position, position + size) of `ndc`
+ */
+inline arm_compute::Window to_window(const ndcoord_t& ndc) {
+    arm_compute::Window win;
+
+    for(unsigned int i = 0; i!=ndrange_max; ++i) {
+        const auto start = ndc.get_position(i);
+        const auto size  = ndc.get_size(i);
+        const auto stop  = start + size;
+
+        //populate the window with the start and end of each NDCoordinate dimension
+        win.set(i, arm_compute::Window::Dimension(start, stop));
+    }
+
+    return win;
+}
+
+/** Convert an `arm_compute::Window` to an `arm_gemm::NDRange` of the same max dimensions
+ *
+ * It should be noted that `arm_compute::Window` specifies a `start()` and an `end()`
+ * whereas `arm_gemm::ndrange_t` only has a size; as a result we store `end() - start()` per dimension
+ *
+ * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndrange_t`
+ * @return the resultant ndrange_t
+ */
+inline ndrange_t to_ndrange(const arm_compute::Window& win) {
+    return { // NOTE(review): six entries are listed by hand while ndrange_max comes from num_max_dimensions - confirm they stay in sync
+        static_cast<unsigned int>(win[0].end() - win[0].start()),
+        static_cast<unsigned int>(win[1].end() - win[1].start()),
+        static_cast<unsigned int>(win[2].end() - win[2].start()),
+        static_cast<unsigned int>(win[3].end() - win[3].start()),
+        static_cast<unsigned int>(win[4].end() - win[4].start()),
+        static_cast<unsigned int>(win[5].end() - win[5].start())
+    };
+}
+
+/** Convert an `arm_compute::Window` to an `arm_gemm::NDCoord` of the same max dimensions
+ *
+ * @param [win] the `arm_compute::Window` we want to convert to `arm_gemm::ndcoord_t`
+ * @return the resultant ndcoord_t; each entry is built from the pair { start(), end() - start() }
+ */
+inline ndcoord_t to_ndcoord(const arm_compute::Window& win) {
+    return { // NOTE(review): six entries are listed by hand while ndrange_max comes from num_max_dimensions - confirm they stay in sync
+        { static_cast<unsigned int>(win[0].start()), static_cast<unsigned int>(win[0].end() - win[0].start()) },
+        { static_cast<unsigned int>(win[1].start()), static_cast<unsigned int>(win[1].end() - win[1].start()) },
+        { static_cast<unsigned int>(win[2].start()), static_cast<unsigned int>(win[2].end() - win[2].start()) },
+        { static_cast<unsigned int>(win[3].start()), static_cast<unsigned int>(win[3].end() - win[3].start()) },
+        { static_cast<unsigned int>(win[4].start()), static_cast<unsigned int>(win[4].end() - win[4].start()) },
+        { static_cast<unsigned int>(win[5].start()), static_cast<unsigned int>(win[5].end() - win[5].start()) }
+    };
+}
+
+} //namespace arm_gemm
diff --git a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
index d17fd5fe97..ea9b524e15 100644
--- a/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/arm_compute/core/NEON/kernels/assembly/gemm_common.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,7 +23,10 @@
  */
 #pragma once
 
+#include "arm_compute/core/NEON/kernels/assembly/arm_gemm_compute_iface.hpp"
+
 #include <cstddef>
+#include <cassert>
 
 #define UNUSED(x)   (void)(x)
 
@@ -51,10 +54,10 @@ public:
                                           void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
                                     const void *bias, /* no row or batch stride needed */   const int bias_multi_stride) = 0;
 
-    /* For threading, we divide the work into some number of units and work
-     * out internally what unit corresponds to what work.  This returns the
-     * total number of units.  */
-    virtual unsigned int get_window_size() const = 0;
+    /** @returns an ndrange containing ranges of the compute space which can be
+     * broken up and parallelised over
+     */
+    virtual ndrange_t get_window_size() const = 0;
 
     /* The maximum thread count is specified when the GEMM is created.  Some
      * implementations need to know how many threads will actually run in
@@ -73,9 +76,12 @@ public:
     /* Whether this GEMM can be dynamically scheduled or not. */
     virtual bool supports_dynamic_scheduling() const { return false; }
 
-    /* Actually do the work.  Provide a threadid to index any per-thread
-     * buffers, and a start/end range to indicate which work to do.  */
-    virtual void execute(unsigned int, unsigned int, int) = 0;
+    /** Main execute member function
+     * @param [in] work_range     specifies the range of work we want to be computed, total range defined by get_window_size()
+     * @param [in] thread_locator where are we inside of the thread space
+     * @param [in] threadid       a unique threadid
+     */
+    virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
 
     /*** Working space interface (optional) ***/
     /* Total number of bytes of temporary working space needed.  If zero, it's not necessary to call set_working_space(). */
@@ -108,8 +114,7 @@ public:
     virtual ~IGemmCommon() { }
 };
 
-/*
- * "Real" GemmCommon class which is templated on the operand and return types.
+/* "Real" GemmCommon class which is templated on the operand and return types.
  *
  * In addition to correctly typed versions of the functions that operate on
  * operand and return data, this class provides a default implementation of
@@ -178,4 +183,19 @@ public:
     }
 };
 
+template<typename GemmKernel> // GemmKernel: any type providing get_window_size()
+inline
+int unsigned get_total_window_size(const GemmKernel& kernel)
+{
+    auto window=kernel.get_window_size();
+
+    unsigned int total = 1; // running product of the size of every dimension of the window
+    for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
+    {
+        total *= window.get_size(i);
+    }
+
+    return total;
+}
+
 } // namespace arm_gemm
diff --git a/arm_compute/runtime/IScheduler.h b/arm_compute/runtime/IScheduler.h
index f68294016a..a5e20ee627 100644
--- a/arm_compute/runtime/IScheduler.h
+++ b/arm_compute/runtime/IScheduler.h
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -27,6 +27,7 @@
 #include "arm_compute/core/CPP/CPPTypes.h"
 
 #include <functional>
+#include <limits>
 
 namespace arm_compute
 {
@@ -42,6 +43,13 @@ public:
         STATIC,  /**< Split the workload evenly among the threads */
         DYNAMIC, /**< Split the workload dynamically using a bucket system */
     };
+
+    /** When arm_compute::IScheduler::Hints::_split_dimension is initialized with this value
+     * then the scheduler is free to break down the problem space over as many dimensions
+     * as it wishes
+     */
+    static constexpr unsigned int split_dimensions_all = std::numeric_limits<unsigned>::max();
+
     /** Scheduler hints
      *
      * Collection of preferences set by the function regarding how to split a given workload
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
index 96e3ce832c..e3355ed2d5 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_fp32.cpp
@@ -26,6 +26,8 @@
 #include "gemm_hybrid.hpp"
 #include "gemm_implementation.hpp"
 #include "gemm_interleaved.hpp"
+#include "gemm_interleaved_2d.hpp"
+#include "gemm_interleaved_pretransposed_2d.hpp"
 #include "gemm_native.hpp"
 #include "gemv_batched.hpp"
 #include "gemv_native_transposed.hpp"
@@ -144,13 +146,31 @@ static const GemmImplementation<float, float> gemm_fp32_methods[] =
     [](const GemmArgs &args) { return new GemmInterleaved<interleaved_fp32_mla_3VLx8, float, float>(args); }
 },
 #endif // __ARM_FEATURE_SVE
+//Pretranspose, 2D split
+{
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "sgemm_12x8",
+    [](const GemmArgs &args) { return args._pretransposed_hint; },
+    [](const GemmArgs &args) { return args._pretransposed_hint; },
+    [](const GemmArgs &args) { return new GemmInterleavedPretransposed2d<sgemm_12x8, float, float>(args); }
+},
+//Transpose, 2D split, no blockmanager
+{
+    GemmMethod::GEMM_INTERLEAVED_2D,
+    "sgemm_12x8",
+    [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; },
+    [](const GemmArgs &args) { return (!args._pretransposed_hint) && args._maxthreads >= 8; },
+    [](const GemmArgs &args) { return new GemmInterleaved2d<sgemm_12x8, float, float>(args); }
+},
+//Transpose, 1D split, with blockmanager
 {
     GemmMethod::GEMM_INTERLEAVED,
     "sgemm_12x8",
-    nullptr,
-    nullptr,
+    [](const GemmArgs &args) { return (!args._pretransposed_hint); },
+    [](const GemmArgs &args) { return (!args._pretransposed_hint); },
     [](const GemmArgs &args) { return new GemmInterleaved<sgemm_12x8, float, float>(args); }
 },
+
 #endif // __aarch64__
 
 #ifdef __arm__
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
index c3abb04db7..0cb3160de4 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -142,8 +142,8 @@ public:
                 _window_range(iceildiv(args._Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmulti) { }
 
     // Interface implementation - Compulsory functions
-    unsigned int get_window_size() const override {
-        return _window_range.total_size();
+    ndrange_t get_window_size() const override {
+        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; // whole window is placed in dimension 0; every other dimension has size 1
     }
 
     // This kernel can always be dynamically scheduled.
@@ -151,8 +151,7 @@ public:
         return true;
     }
 
-    // Execute
-    void execute(unsigned int start, unsigned int end, int threadid) override {
+    void execute_1d(unsigned int start, unsigned int end, int threadid) {
         UNUSED(threadid);
 #ifdef CYCLE_PROFILING
         profiler prof;
@@ -215,6 +214,17 @@ public:
         }
     }
 
+    // Execute: adapter for the N-dimensional interface; only dimension 0 of work_range is read and forwarded to execute_1d()
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        UNUSED(thread_locator);
+
+        const auto start = work_range.get_position(0);
+        const auto size  = work_range.get_size(0);
+        const auto stop  = start + size; // end of the range, computed as position + size
+
+        execute_1d(start, stop, threadid);
+    }
+
     // Interface implementation - pretransposed
     bool B_is_pretransposed() const override {
         return true;
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
index 22b6960baf..3d7ad99d1e 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_hybrid_quantized.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -149,8 +149,8 @@ public:
                 _qp (qp), _nthreads(args._maxthreads) { }
 
     // Interface implementation - Compulsory functions
-    unsigned int get_window_size() const override {
-        return _window_range.total_size();
+    ndrange_t get_window_size() const override {
+        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u }; // whole window is placed in dimension 0; every other dimension has size 1
     }
 
     // This kernel can always be dynamically scheduled.
@@ -158,8 +158,7 @@ public:
         return true;
     }
 
-    // Execute
-    void execute(unsigned int start, unsigned int end, int threadid) override {
+    void execute_1d(unsigned int start, unsigned int end, int threadid) {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -234,6 +233,17 @@ public:
         }
     }
 
+    // Execute: adapter for the N-dimensional interface; only dimension 0 of work_range is read and forwarded to execute_1d()
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        UNUSED(thread_locator);
+
+        const auto start = work_range.get_position(0);
+        const auto size  = work_range.get_size(0);
+        const auto stop  = start + size; // end of the range, computed as position + size
+
+        execute_1d(start, stop, threadid);
+    }
+
     // Working space needed for intermediate result buffers.
     size_t get_working_size() const override {
         return (_nthreads * strategy::out_height() * _Nsize * sizeof(Tri));
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index efd984561d..4897bedf47 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -385,9 +385,9 @@ public:
     // out work in units of out_height.  Factor batches into the window, but
     // not multi for now (as this would cause problems with the buffer
     // manager).
-    unsigned int get_window_size() const override {
-        // _Mround is a multiple of out_height by definition.
-        return (_Mround / strategy::out_height()) * _nbatches;
+    ndrange_t get_window_size() const override {
+        auto m_win_size = (_Mround / strategy::out_height()) * _nbatches; // _Mround is a multiple of out_height by definition.
+        return { m_win_size, 1u, 1u, 1u, 1u, 1u }; // whole window is placed in dimension 0; every other dimension has size 1
     }
 
     // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads.
@@ -399,7 +399,7 @@ public:
     }
 
     // Execute
-    void execute(unsigned int start, unsigned int end, int threadid) override {
+    void execute_1d(unsigned int start, unsigned int end, int threadid) {
         if (_pretransposed) {
             execute_internal<true>(start, end, threadid);
         } else {
@@ -407,6 +407,16 @@ public:
         }
     }
 
+    // Execute: adapter for the N-dimensional interface; only dimension 0 of work_range is read and forwarded to execute_1d()
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        UNUSED(thread_locator);
+
+        const auto start = work_range.get_position(0);
+        const auto stop  = work_range.get_position_end(0);
+
+        execute_1d(start, stop, threadid);
+    }
+
     // Interface implementation - working space
     size_t get_working_size() const override {
         // In all cases, we need one A buffer plus a C buffer per thread.
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
new file mode 100644
index 0000000000..53f8e6c938
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_2d.hpp
@@ -0,0 +1,449 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <algorithm>
+#include <cassert>
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND    64
+#define ROUND_UP(x)    ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm {
+
+template<typename strategy, typename To, typename Tr>
+class GemmInterleaved2d : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi; // element type of the packed A and B panels
+    typedef typename strategy::result_type Tri;  // element type of the C panel
+
+    /* const properties set by constructor */
+    const CPUInfo * const _ci;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const unsigned int _nbatches;
+    const unsigned int _nmulti;
+
+    const bool _trA; // forwarded to PrepareA
+    const bool _trB; // forwarded to PrepareB
+
+    const Activation _act;
+
+    const int _maxthreads; // used to size the per-thread regions of the working space
+    int _nthreads;
+
+    /* Blocking info */
+    unsigned int _k_block=0; // step used by blockwalker along K
+    unsigned int _x_block=0; // step used by blockwalker along X (N)
+
+    unsigned int _Mround_div=0;
+    unsigned int _Mround=0;
+    unsigned int _Nround_div=0;
+    unsigned int _Nround=0;
+
+    /* Working space (holds the C, A and B panels used by execute_transpose) */
+    void *_working_space=nullptr;
+
+    /* We will need to walk through the blocks of B in a few contexts, so
+     * factor that out.  Iteration order: X blocks first, then K blocks, then multis.  */
+    class blockwalker {
+    private:
+        /* Size loops, etc. based on our parent's configuration */
+        const GemmInterleaved2d<strategy, To, Tr> &_parent;
+
+        /* K, X and multi parameters for current iteration; _xmin/_xmax bound the X range being walked. */
+        unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
+
+        unsigned int _index=0;
+        bool _done=false;
+        bool _newkblock=true;
+        bool _newmulti=true; // written by advance(); no accessor in this class reads it
+
+    public:
+        blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent) // walks the full X range [0, parent._Nsize)
+        : _parent(parent)
+        , _xmax { parent._Nsize }
+        { }
+
+        blockwalker(const GemmInterleaved2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax) // walks only [x0, xmax)
+        : _parent(parent)
+        , _x0   { x0   }
+        , _xmin { x0   }
+        , _xmax { xmax }
+        {
+            assert(_x0 <= _xmax);
+        }
+
+        unsigned int xmax() { // end of the current X block, clamped to _xmax
+            return std::min(_x0 + _parent._x_block, _xmax);
+        }
+
+        unsigned int kmax() { // end of the current K block, clamped to the parent's _Ksize
+            return std::min(_k0 + _parent._k_block, _parent._Ksize);
+        }
+
+        /* Advance to the next block, return false at the end. */
+        bool advance(void) {
+            if (_done) {
+                return false;
+            }
+
+            _newkblock=false;
+            _x0 += _parent._x_block;
+            if (_x0 >= _xmax) { // X range exhausted: rewind X to _xmin and move on to the next K block
+                _x0=_xmin;
+                _k0 += _parent._k_block;
+                if (_k0 >= _parent._Ksize) { // K exhausted: restart K and move on to the next multi
+                    _k0=0;
+                    _multi++;
+                    if (_multi >= _parent._nmulti) {
+                        _done=true;
+                        return false;
+                    }
+                    _newmulti=true;
+                }
+                _newkblock=true;
+            }
+            _index++;
+
+            return true;
+        }
+
+        unsigned int k0(void) { return _k0; }
+        unsigned int x0(void) { return _x0; }
+        unsigned int multi(void) { return _multi; }
+        unsigned int index(void) { return _index; }
+        bool done(void) { return _done; }
+        bool newkblock(void) { return _newkblock; }
+    };
+
+    // A working size: bytes of one packed-A buffer; execute_transpose() reserves _maxthreads of these and selects one by nthreadid.
+    size_t get_a_working_size() const {
+        return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches) * 2; // NOTE(review): reason for the "* 2" is not visible here - confirm
+    }
+
+    // B working size: bytes of one packed-B block (_x_block by _k_block); execute_transpose() selects one per thread by threadid.
+    size_t get_b_working_size() const {
+        return ROUND_UP(sizeof(Toi) * _x_block * _k_block);
+    }
+
+    // C working size: bytes of one C panel (_x_block by out_height); one is needed per thread, selected by threadid.
+    size_t get_c_working_size() const {
+        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+    }
+
+    void execute_transpose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+        UNUSED(mthreadid);
+
+        strategy strat(_ci);
+
+        /* Translate 'start' and 'end' into a position within the batches and rows. */
+        const unsigned int window_per_batch = _Mround / strategy::out_height();
+        unsigned int batch_0   = m_start / window_per_batch;
+        unsigned int batch_end = m_end   / window_per_batch;
+
+        /* Compute the M values to operate on */
+        unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
+        unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
+
+        unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
+        unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
+
+        blockwalker current(*this, n_0, n_max);
+
+        /* get workspace as int8_t */
+        assert(_working_space);
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+
+        auto c_panel_start = working_space_bytes;
+        auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
+        auto b_panel_start = a_panel_start + get_a_working_size() * _maxthreads;
+
+        auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
+        auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * nthreadid);
+        auto b_panel = reinterpret_cast<Toi *>(b_panel_start + get_b_working_size() * threadid);
+
+
+        // newkblock() is always true on the first iteration, so this will be set properly on the first loop.
+
+        int kern_k = 0;
+        for (;!current.done();current.advance()) {
+              const int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+            /*
+             * The entirity of A^kblock is transpose upfront and computed against individual
+             * blocks of B (xblock)
+             *
+             * Therefore, we only need to retranspose when k_block progresses
+             */
+            if (current.newkblock()) {
+                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
+                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                    if (first_m >= last_m)
+                        continue;
+
+                    auto a_thread_panel_in  = this->_Aptr
+                                            + (batch * this->_A_batch_stride)
+                                            + (current.multi() * this->_A_multi_stride);
+
+                    auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
+
+                    strat.transforms.PrepareA(
+                        a_thread_panel_out,
+                        a_thread_panel_in,
+                        this->_lda,
+                        first_m,
+                        last_m,
+                        current.k0(),
+                        current.kmax(),
+                        _trA);
+                }
+
+                kern_k = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
+                kern_k *= strat.k_unroll();
+            }
+
+            auto *b_panel_in = this->_Bptr + (current.multi() * this->_B_multi_stride);
+
+            strat.transforms.PrepareB(
+                b_panel,    //dst
+                b_panel_in, //src
+                this->_ldb,
+                current.x0(),   //idx from
+                current.xmax(), //idx to
+                current.k0(),
+                current.kmax(),
+                _trB);
+
+            //Iterate over the batches
+            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
+                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                if (first_m >= last_m)
+                    continue;
+
+                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+
+
+                //Iterate over the interleaved rows of the packed A matrix
+                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
+                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());
+
+                    strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+                    a_ptr += (strategy::out_height() * kern_k);
+
+                    const bool first_pass = current.k0()==0;
+                    const bool last_pass  = current.kmax()==_Ksize;
+
+                    auto c_panel_out = this->_Cptr
+                                     + this->_C_batch_stride * batch
+                                     + this->_C_multi_stride * current.multi();
+
+                    auto bias        = (first_pass && this->_bias)
+                                     ? this->_bias + (current.multi() * this->_bias_multi_stride)
+                                     : nullptr;
+
+                    auto act        = last_pass ? _act : Activation();
+
+                    strat.transforms.Merge(
+                        c_panel_out,
+                        c_panel,
+                        this->_ldc,
+                        y,
+                        ymax,
+                        current.x0(),
+                        current.xmax(),
+                        bias,
+                        act,
+                        !first_pass);  //Append
+                }
+            }
+        }
+    }
+public:
+    GemmInterleaved2d(GemmInterleaved2d &) = delete;               // Copy construction is disabled.
+    GemmInterleaved2d & operator= (GemmInterleaved2d &) = delete;  // Copy assignment is disabled.
+
+    /* Constructor: copies the problem description out of args, rounds M and N up to whole
+     * kernel tiles, then picks the K/N blocking (_k_block, _x_block) from args._cfg or cache sizes. */
+    GemmInterleaved2d(const GemmArgs &args)
+    :    _ci(args._ci)
+    ,    _Msize(args._Msize)
+    ,    _Nsize(args._Nsize)
+    ,    _Ksize(args._Ksize)
+    ,    _nbatches(args._nbatches)
+    ,    _nmulti(args._nmulti)
+    ,    _trA(args._trA)
+    ,    _trB(args._trB)
+    ,    _act(args._act)
+    ,    _maxthreads(args._maxthreads)
+    ,    _nthreads(args._maxthreads)
+    // Work out the rounded sizes of M and N - needed for some buffers.
+    ,    _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
+    ,    _Mround     ( _Mround_div * strategy::out_height()     )
+    ,    _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
+    ,    _Nround     ( _Nround_div * strategy::out_width()     )
+    {
+        const unsigned int L1_size = _ci->get_L1_cache_size();
+        const unsigned int L2_size = _ci->get_L2_cache_size();
+
+        assert(_maxthreads > 0);
+
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            _k_block = args._cfg->inner_block_size;
+        } else {
+            // k_block: Find out how much of the larger array can be loaded into half the cache.
+            // This should account for associative caches.
+            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+            // Needs to be (at least a single) multiple of the K unroll level.
+            _k_block /= strategy::k_unroll();
+            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+
+            // Now tune to presented problem size; this is how many blocks we need.
+            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
+
+            // So divide the space equally into that many blocks.
+            _k_block = iceildiv(_Ksize, num_k_blocks);
+
+            // And round UP to the K unroll level required.
+            _k_block = iceildiv(_k_block, strategy::k_unroll());
+            _k_block *= strategy::k_unroll();
+        }
+
+        if (args._cfg && args._cfg->outer_block_size) {
+            _x_block = args._cfg->outer_block_size;
+        } else {
+            // x_block: Work out how many rows (of length k_block) will fit in the L2
+            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+            const size_t scaled_l2_size = (L2_size * 9) / 10;
+            const size_t k_block_area   = _k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+
+            // If the L1 contents do not fit in that budget the unsigned subtraction would wrap around to a
+            // huge value, so fall back to zero here and let the clamp below select a single kernel width.
+            _x_block = (k_block_area < scaled_l2_size) ? (scaled_l2_size - k_block_area) / (sizeof(Toi) * _k_block) : 0;
+
+            // Needs to be (at least a single) multiple of the kernel output width.
+            _x_block /= strategy::out_width();
+            _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+            // And tune to the presented problem size.
+            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+            _x_block = iceildiv(_Nsize, num_x_blocks);
+
+            _x_block = iceildiv(_x_block, strategy::out_width());
+            _x_block *= strategy::out_width();
+        }
+    }
+
+    // Interface implementation - Compulsory functions
+    // The schedulable window is 2D: (row tiles x batches) by column tiles; remaining dimensions are 1.
+    ndrange_t get_window_size() const override {
+        const unsigned int row_tiles_all_batches = (_Mround / strategy::out_height()) * _nbatches;
+        const unsigned int col_tiles             = _Nround_div;
+        return { row_tiles_all_batches, col_tiles, 1u, 1u, 1u, 1u };
+    }
+
+    // set_nthreads: record the requested thread count, capped at the constructor-time maximum.
+    void set_nthreads(int nthreads) override {
+        _nthreads = (_maxthreads < nthreads) ? _maxthreads : nthreads;
+    }
+
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        /*
+         * This GEMM implementation can only be split over the M & N dimensions;
+         * the framework is told about this limitation through get_window_size().
+         */
+        assert(ndrange_popcount(work_range) <= 2);
+
+        // Begin/end pairs along dimension 0 (M) and dimension 1 (N), built from position + size.
+        const auto m_begin  = work_range.get_position(0);
+        const auto m_finish = m_begin + work_range.get_size(0);
+        const auto n_begin  = work_range.get_position(1);
+        const auto n_finish = n_begin + work_range.get_size(1);
+
+        // Position of this thread along each dimension, as reported by thread_locator.
+        const auto m_thread = thread_locator.get_position(0);
+        const auto n_thread = thread_locator.get_position(1);
+
+        execute_transpose(m_begin, m_finish, n_begin, n_finish, threadid, m_thread, n_thread);
+    }
+
+    std::size_t get_working_size() const override {
+        /*
+         * The scheduler's split of the work is not known in advance, so
+         * reserve a full set of C, A and B scratch buffers for each of the
+         * _maxthreads possible threads, whichever dimension ends up being
+         * parallelised.
+         *
+         * The final 64 bytes are slack so that set_working_space() can
+         * round the base pointer up to a cache line boundary.
+         */
+        const auto c_bytes_all_threads = get_c_working_size() * _maxthreads;
+        const auto a_bytes_all_threads = get_a_working_size() * _maxthreads;
+        const auto b_bytes_all_threads = get_b_working_size() * _maxthreads;
+        return c_bytes_all_threads + a_bytes_all_threads + b_bytes_all_threads + 64;
+    }
+
+
+    void set_working_space(void *working_space) override {
+        // Round the supplied pointer up to the next 64-byte (cache line) boundary; the
+        // extra 64 bytes requested by get_working_size() leave room for this adjustment.
+        const intptr_t address      = reinterpret_cast<intptr_t>(working_space);
+        const intptr_t misalignment = address & 0x3F;
+
+        // Bytes to skip: zero when already aligned, otherwise the distance to the boundary.
+        const size_t padding = (misalignment != 0) ? static_cast<size_t>(0x40 - misalignment) : 0;
+
+        int8_t *aligned_base = reinterpret_cast<int8_t *>(working_space);
+        aligned_base += padding;
+
+        // Remember the aligned base address.
+        _working_space = reinterpret_cast<void *>(aligned_base);
+    }
+
+    ~GemmInterleaved2d() override = default;
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
new file mode 100644
index 0000000000..eff4877198
--- /dev/null
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved_pretransposed_2d.hpp
@@ -0,0 +1,514 @@
+/*
+ * Copyright (c) 2020 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#pragma once
+
+#include "arm_gemm.hpp"
+#include "utils.hpp"
+
+#include "mergeresults.hpp"
+#include "transform.hpp"
+
+#ifdef CYCLE_PROFILING
+#include "profiler.hpp"
+#endif
+
+#include <algorithm>
+#include <cassert>
+
+// Some macros used to decide how much working space to allocate.
+// Round allocations up to the next cache line.
+#define ALLOC_ROUND    64
+#define ROUND_UP(x)    ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND)
+
+// Implementation of the GemmCommon abstract class.
+//
+// This implementation interleaves the source matrices in blocks - good for
+// larger matrices.
+namespace arm_gemm {
+
+template<typename strategy, typename To, typename Tr>
+class GemmInterleavedPretransposed2d : public GemmCommon<To, Tr> {
+    typedef typename strategy::operand_type Toi;
+    typedef typename strategy::result_type Tri;
+
+    /* const properties set by constructor */
+    const CPUInfo * const _ci;
+
+    const unsigned int _Msize;
+    const unsigned int _Nsize;
+    const unsigned int _Ksize;
+
+    const unsigned int _nbatches;
+    const unsigned int _nmulti;
+
+    const bool _trA;
+    const bool _trB;
+
+    const Activation _act;
+
+    const int _maxthreads;
+    int _nthreads;
+
+    /* Blocking info */
+    unsigned int _k_block=0;
+    unsigned int _x_block=0;
+
+    unsigned int _Mround_div=0;
+    unsigned int _Mround=0;
+    unsigned int _Nround_div=0;
+    unsigned int _Nround=0;
+
+    /* Working space, pretransposed buffer */
+    const Toi *_B_transposed=nullptr;
+    void *_working_space=nullptr;
+
+    /* Iterates over the blocks of B (and matching K ranges) that make up the problem: the N range
+     * advances fastest in steps of _x_block, then K in steps of _k_block, then the multi index. */
+    class blockwalker {
+    private:
+        /* Size loops, etc. based on our parent's configuration */
+        const GemmInterleavedPretransposed2d<strategy, To, Tr> &_parent;
+        /* K, X and multi parameters for current iteration; [_xmin, _xmax) is the N range being walked. */
+        unsigned int _k0=0, _x0=0, _xmin=0, _xmax=0, _multi=0;
+        unsigned int _index=0;  /* Number of successful advance() calls so far. */
+        bool _done=false;       /* Set once advance() has stepped past the final multi. */
+        bool _newkblock=true;   /* True while the current block is the first of its K range. */
+
+    public:
+        /* Walk the whole of N: [0, parent._Nsize). */
+        explicit blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent)
+        : _parent(parent)
+        , _xmax { parent._Nsize }
+        { }
+
+        /* Walk only the N sub-range [x0, xmax). */
+        blockwalker(const GemmInterleavedPretransposed2d<strategy, To, Tr> &parent, unsigned int x0, unsigned int xmax)
+        : _parent(parent)
+        , _x0   { x0   }
+        , _xmin { x0   }
+        , _xmax { xmax }
+        {
+            assert(_x0 <= _xmax);
+        }
+
+        /* Exclusive upper N bound of the current block, clipped to the walked range. */
+        unsigned int xmax() const {
+            return std::min(_x0 + _parent._x_block, _xmax);
+        }
+
+        /* Exclusive upper K bound of the current block, clipped to the problem's K size. */
+        unsigned int kmax() const {
+            return std::min(_k0 + _parent._k_block, _parent._Ksize);
+        }
+
+        /* Advance to the next block, return false at the end. */
+        bool advance() {
+            if (_done) {
+                return false;
+            }
+
+            _newkblock=false;
+            _x0 += _parent._x_block;
+            if (_x0 >= _xmax) {
+                _x0=_xmin;
+                _k0 += _parent._k_block;
+                if (_k0 >= _parent._Ksize) {
+                    _k0=0;
+                    _multi++;
+                    if (_multi >= _parent._nmulti) {
+                        _done=true;
+                        return false;
+                    }
+                }
+                _newkblock=true;
+            }
+            _index++;
+
+            return true;
+        }
+
+        unsigned int k0() const { return _k0; }
+        unsigned int x0() const { return _x0; }
+        unsigned int multi() const { return _multi; }
+        unsigned int index() const { return _index; }
+        bool done() const { return _done; }
+        bool newkblock() const { return _newkblock; }
+    };
+
+    // A working size: get_working_size() reserves one of these for every thread.
+    size_t get_a_working_size() const {
+        return 2 * ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches);
+    }
+
+    // B is supplied pretransposed, so no working space needs to be allocated for it.
+    size_t get_b_working_size() const {
+        return 0;
+    }
+
+    // C working size: one needed per thread (room for _x_block results across out_height rows).
+    size_t get_c_working_size() const {
+        return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height());
+    }
+
+    // Internal execute function: m_start/m_end count out_height-row tiles across all batches and n_start/n_end
+    // count out_width-column tiles; threadid selects this thread's scratch panels in the working space.
+    void execute_pretranspose(unsigned int m_start, unsigned int m_end, unsigned int n_start, unsigned int n_end, int threadid, int mthreadid, int nthreadid) {
+        /* Make sure we've been set up correctly. */
+        assert(_B_transposed);
+        assert(_working_space);
+        assert(this->_Aptr);
+        assert(this->_Cptr);
+        /* The per-dimension thread positions are not used by this implementation. */
+        UNUSED(mthreadid);
+        UNUSED(nthreadid);
+
+#ifdef CYCLE_PROFILING
+        profiler prof;
+#endif
+        strategy strat(_ci);
+
+        /* Translate 'm_start' and 'm_end' into a position within the batches and rows. */
+        const unsigned int window_per_batch = _Mround / strategy::out_height();
+        unsigned int batch_0   = m_start / window_per_batch;
+        unsigned int batch_end = m_end   / window_per_batch;
+
+        /* Compute the M values to operate on: first row in batch_0, one-past-last row in batch_end */
+        unsigned int m_0   = (m_start - (batch_0 * window_per_batch)) * strategy::out_height();
+        unsigned int m_max = (m_end - (batch_end * window_per_batch)) * strategy::out_height();
+        /* Column range in elements, clamped to the real (unpadded) N size. */
+        unsigned int n_0   = std::min(this->_Nsize, strategy::out_width() * n_start);
+        unsigned int n_max = std::min(this->_Nsize, strategy::out_width() * n_end);
+
+        blockwalker current(*this, n_0, n_max);
+
+        int8_t *working_space_bytes = reinterpret_cast<int8_t *>(_working_space);
+        /* Working space layout: _maxthreads C panels first, followed by the per-thread A panels. */
+        auto c_panel_start = working_space_bytes;
+        auto a_panel_start = c_panel_start + get_c_working_size() * _maxthreads;
+
+        auto c_panel = reinterpret_cast<Tri *>(c_panel_start + get_c_working_size() * threadid);
+        auto a_panel = reinterpret_cast<Toi *>(a_panel_start + get_a_working_size() * threadid);
+
+        /* B^t is stored in interleaved panels grouped by K-block ("pages").
+         * b_panel_start points at the start of the current K-page; when we
+         * move on to the next K-block we just add the size of the previous
+         * page to this base pointer.
+         */
+        const Toi *b_panel_start = _B_transposed;
+        // b_panel points to the start of our current block inside of the current K-page
+        const Toi *b_panel       = b_panel_start;
+
+        // newkblock() is always true on the first iteration, so b_page_size and kern_k are set properly on the first loop.
+        unsigned b_page_size = 0;
+        int kern_k = 0;
+        for (;!current.done();current.advance()) {
+            int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width());
+
+            if (current.newkblock()) {
+                kern_k         = iceildiv(current.kmax() - current.k0(), strategy::k_unroll());
+                kern_k        *= strat.k_unroll();
+
+                unsigned b_thread_start_offset = iceildiv(current.x0(), strategy::out_width());
+
+                b_panel_start += b_page_size;
+                b_panel        = b_panel_start + (b_thread_start_offset * strat.out_width() * kern_k);
+                b_page_size    = _Nround * kern_k;
+
+                for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                    unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
+                    unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                    if (first_m >= last_m)
+                        continue;
+
+                    auto a_thread_panel_in  = this->_Aptr
+                                            + (batch * this->_A_batch_stride)
+                                            + (current.multi() * this->_A_multi_stride);
+
+                    auto a_thread_panel_out = a_panel + ((batch * _Mround + first_m) * _k_block);
+
+                    strat.transforms.PrepareA(
+                        a_thread_panel_out,
+                        a_thread_panel_in,
+                        this->_lda,
+                        first_m,
+                        last_m,
+                        current.k0(),
+                        current.kmax(),
+                        _trA);
+                }
+            }
+
+            /* Do the actual work: run the kernel on each row tile, then merge the result tile into C. */
+            for (unsigned int batch = batch_0; batch <= batch_end; batch++) {
+                unsigned int first_m = (batch == batch_0)   ? m_0   : 0;
+                unsigned int last_m  = (batch == batch_end) ? m_max : _Msize;
+
+                const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block;
+
+                if (first_m >= last_m)
+                    continue;
+
+                for (unsigned int y=first_m; y<last_m; y+=strategy::out_height()) {
+                    unsigned int ymax = std::min(_Msize, y + strategy::out_height());
+
+                    strat.kernel(a_ptr, b_panel, c_panel, 1, bblocks, kern_k);
+                    a_ptr += (strategy::out_height() * kern_k);
+
+                    /* Only activate on last pass, only add bias on first pass, ask for accumulation on any non-first pass */
+                    const bool first_pass = current.k0()==0;
+                    const bool last_pass  = current.kmax()==_Ksize;
+
+                    auto c_panel_out = this->_Cptr
+                                     + this->_C_batch_stride * batch
+                                     + this->_C_multi_stride * current.multi();
+
+                    auto bias        = (first_pass && this->_bias)
+                                     ? this->_bias + (current.multi() * this->_bias_multi_stride)
+                                     : nullptr;
+
+                    auto act        = last_pass ? _act : Activation();
+
+                    strat.transforms.Merge(
+                        c_panel_out,
+                        c_panel,
+                        this->_ldc,
+                        y,
+                        ymax,
+                        current.x0(),
+                        current.xmax(),
+                        bias,
+                        act,
+                        !first_pass);  //Append
+                }
+            }
+            /* Step to the next block of B within the current K-page. */
+            b_panel += (bblocks * strat.out_width() * kern_k);
+        }
+    }
+
+public:
+    GemmInterleavedPretransposed2d(GemmInterleavedPretransposed2d &) = delete;               // Copy construction is disabled.
+    GemmInterleavedPretransposed2d & operator= (GemmInterleavedPretransposed2d &) = delete;  // Copy assignment is disabled.
+
+    /* Constructor: copies the problem description out of args, rounds M and N up to whole
+     * kernel tiles, then picks the K/N blocking (_k_block, _x_block) from args._cfg or cache sizes. */
+    GemmInterleavedPretransposed2d(const GemmArgs &args)
+    :    _ci(args._ci)
+    ,    _Msize(args._Msize)
+    ,    _Nsize(args._Nsize)
+    ,    _Ksize(args._Ksize)
+    ,    _nbatches(args._nbatches)
+    ,    _nmulti(args._nmulti)
+    ,    _trA(args._trA)
+    ,    _trB(args._trB)
+    ,    _act(args._act)
+    ,    _maxthreads(args._maxthreads)
+    ,    _nthreads(args._maxthreads)
+    // Work out the rounded sizes of M and N - needed for some buffers.
+    ,    _Mround_div ( iceildiv(_Msize, strategy::out_height()) )
+    ,    _Mround     ( _Mround_div * strategy::out_height()     )
+    ,    _Nround_div ( iceildiv(_Nsize, strategy::out_width()) )
+    ,    _Nround     ( _Nround_div * strategy::out_width()     )
+    {
+        assert(args._pretransposed_hint);
+        assert(_maxthreads > 0);
+
+        const unsigned int L1_size = _ci->get_L1_cache_size();
+        const unsigned int L2_size = _ci->get_L2_cache_size();
+
+        // Work out blocking parameters, or override from provided GemmConfig
+        if (args._cfg && args._cfg->inner_block_size) {
+            _k_block = args._cfg->inner_block_size;
+        } else {
+            // k_block: Find out how much of the larger array can be loaded into half the cache.
+            // This should account for associative caches.
+            _k_block = (L1_size / 2) / (sizeof(Toi) * (std::max(strategy::out_width(), strategy::out_height())));
+
+            // Needs to be (at least a single) multiple of the K unroll level.
+            _k_block /= strategy::k_unroll();
+            _k_block = std::max(_k_block, 1U) * strategy::k_unroll();
+
+            // Now tune to presented problem size; this is how many blocks we need.
+            unsigned int num_k_blocks = iceildiv(_Ksize, _k_block);
+
+            // So divide the space equally into that many blocks.
+            _k_block = iceildiv(_Ksize, num_k_blocks);
+
+            // And round UP to the K unroll level required.
+            _k_block = iceildiv(_k_block, strategy::k_unroll());
+            _k_block *= strategy::k_unroll();
+        }
+
+        if (args._cfg && args._cfg->outer_block_size) {
+            _x_block = args._cfg->outer_block_size;
+        } else {
+            // x_block: Work out how many rows (of length k_block) will fit in the L2
+            // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents.
+            const size_t scaled_l2_size = (L2_size * 9) / 10;
+            const size_t k_block_area   = _k_block * sizeof(Toi) * (strategy::out_width() + strategy::out_height());
+            // If the L1 contents exceed that budget the unsigned subtraction would wrap; use zero and let the clamp below pick one kernel width.
+            _x_block = (k_block_area < scaled_l2_size) ? (scaled_l2_size - k_block_area) / (sizeof(Toi) * _k_block) : 0;
+
+            // Needs to be (at least a single) multiple of the kernel output width.
+            _x_block /= strategy::out_width();
+            _x_block = std::max(_x_block, 1U) * strategy::out_width();
+
+            // And tune to the presented problem size.
+            unsigned int num_x_blocks = iceildiv(_Nsize, _x_block);
+            _x_block = iceildiv(_Nsize, num_x_blocks);
+
+            _x_block = iceildiv(_x_block, strategy::out_width());
+            _x_block *= strategy::out_width();
+        }
+    }
+
+    // Interface implementation - Compulsory functions
+    // The schedulable window is 2D: (row tiles x batches) by column tiles; remaining dimensions are 1.
+    ndrange_t get_window_size() const override {
+        const unsigned int row_tiles_all_batches = (_Mround / strategy::out_height()) * _nbatches;
+        const unsigned int col_tiles             = _Nround_div;
+        return { row_tiles_all_batches, col_tiles, 1u, 1u, 1u, 1u };
+    }
+
+    // set_nthreads: record the requested thread count, capped at the constructor-time maximum.
+    void set_nthreads(int nthreads) override {
+        _nthreads = (_maxthreads < nthreads) ? _maxthreads : nthreads;
+    }
+
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        /* This GEMM implementation can only be split over the M & N dimensions;
+         * the framework is told about this limitation through get_window_size().
+         */
+        assert(ndrange_popcount(work_range) <= 2);
+
+        // Begin/end pairs along dimension 0 (M) and dimension 1 (N), built from position + size.
+        const auto m_begin  = work_range.get_position(0);
+        const auto m_finish = m_begin + work_range.get_size(0);
+        const auto n_begin  = work_range.get_position(1);
+        const auto n_finish = n_begin + work_range.get_size(1);
+
+        // Position of this thread along each dimension, as reported by thread_locator.
+        const auto m_thread = thread_locator.get_position(0);
+        const auto n_thread = thread_locator.get_position(1);
+
+        execute_pretranspose(m_begin, m_finish, n_begin, n_finish, threadid, m_thread, n_thread);
+    }
+
+    std::size_t get_working_size() const override {
+        /* The scheduler's split of the work is not known in advance, so
+         * reserve one C panel and one A panel for each of the _maxthreads
+         * possible threads (B needs no scratch as it is pretransposed).
+         *
+         * The final 64 bytes are slack so that set_working_space() can
+         * round the base pointer up to a cache line boundary.
+         */
+        const std::size_t c_bytes_all_threads = get_c_working_size() * _maxthreads;
+        const std::size_t a_bytes_all_threads = get_a_working_size() * _maxthreads;
+
+        return c_bytes_all_threads + a_bytes_all_threads + 64;
+    }
+
+
+    void set_working_space(void *working_space) override {
+        // Round the supplied pointer up to the next 64-byte (cache line) boundary; the
+        // extra 64 bytes requested by get_working_size() leave room for this adjustment.
+        const intptr_t address      = reinterpret_cast<intptr_t>(working_space);
+        const intptr_t misalignment = address & 0x3F;
+
+        // Bytes to skip: zero when already aligned, otherwise the distance to the boundary.
+        const size_t padding = (misalignment != 0) ? static_cast<size_t>(0x40 - misalignment) : 0;
+
+        int8_t *aligned_base = reinterpret_cast<int8_t *>(working_space);
+        aligned_base += padding;
+
+        // Store the aligned base; execute_pretranspose() carves the per-thread panels out of it.
+        _working_space = reinterpret_cast<void *>(aligned_base);
+    }
+
+    // Interface implementation - pretransposed
+    bool B_is_pretransposed() const override {
+        return true;  // This implementation always reads B from the pretransposed buffer.
+    }
+
+    bool B_pretranspose_required() const override {
+        return !_B_transposed;  // Still required until a pretransposed buffer has been set.
+    }
+
+    // TODO: this could almost certainly be considerably simpler.
+    // Returns the size in bytes of the pretransposed B buffer: the sum of the padded sizes of every block.
+    size_t get_B_pretransposed_array_size() const override {
+        size_t bytes_needed = 0;
+
+        for (blockwalker walker(*this); !walker.done(); walker.advance()) {
+            /* Extent of this block before padding. */
+            const unsigned int x_extent = walker.xmax() - walker.x0();
+            const unsigned int k_extent = walker.kmax() - walker.k0();
+
+            /* Pad N up to whole kernel output widths and K up to whole unroll steps. */
+            const unsigned int x_tiles  = iceildiv(x_extent, strategy::out_width());
+            const unsigned int x_padded = x_tiles * strategy::out_width();
+            const unsigned int k_steps  = iceildiv(k_extent, strategy::k_unroll());
+            const unsigned int k_padded = k_steps * strategy::k_unroll();
+
+            /* Element count of the padded block, scaled to bytes. */
+            bytes_needed += x_padded * k_padded * sizeof(Toi);
+        }
+
+        return bytes_needed;
+    }
+
+    void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override {
+        // The caller-provided buffer becomes the pretransposed store used by execute_pretranspose().
+        Toi *write_ptr = reinterpret_cast<Toi *>(in_buffer);
+        _B_transposed = write_ptr;
+        strategy strat(_ci);
+
+        for (blockwalker walker(*this); !walker.done(); walker.advance()) {
+            /* Extent of this block before padding. */
+            const unsigned int x_extent = walker.xmax() - walker.x0();
+            const unsigned int k_extent = walker.kmax() - walker.k0();
+
+            /* Pad N up to whole kernel output widths and K up to whole unroll steps. */
+            const unsigned int x_padded = iceildiv(x_extent, strategy::out_width()) * strategy::out_width();
+            const unsigned int k_padded = iceildiv(k_extent, strategy::k_unroll()) * strategy::k_unroll();
+
+            /* Transform this block of the current multi's B into the buffer. */
+            const To *multi_B = B + (walker.multi() * B_multi_stride);
+            strat.transforms.PrepareB(write_ptr, multi_B, ldb,
+                                      walker.x0(), walker.xmax(), walker.k0(), walker.kmax(), _trB);
+
+            /* Step past the padded footprint of this block. */
+            write_ptr += (x_padded * k_padded);
+        }
+    }
+
+    void set_pretransposed_B_data(void *in_buffer) override {
+        _B_transposed = static_cast<Toi *>(in_buffer);  // Adopt an already-pretransposed buffer as-is.
+    }
+
+    ~GemmInterleavedPretransposed2d() override = default;
+};
+
+} // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
index fe6ebef045..c2f742b5cf 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_native.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -87,8 +87,8 @@ public:
                  _window_range(iceildiv(_Msize, strategy::out_height()), _nbatches, iceildiv(_Nsize, _n_block), _nmultis) { }
 
     // Window is amount per multi multiplied by total number of multis.
-    unsigned int get_window_size() const override {
-        return _window_range.total_size();
+    ndrange_t get_window_size() const override {
+        return { _window_range.total_size(), 1u, 1u, 1u, 1u, 1u };
     }
 
     // Native GEMMs can always be dynamically scheduled (whether requested or not)
@@ -97,7 +97,7 @@ public:
     }
 
     // Actually execute the GEMM.
-    void execute(unsigned int start, unsigned int end, int) override {
+    void execute_1d(unsigned int start, unsigned int end, int) {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -139,6 +139,16 @@ public:
             }
         } while (p.next_dim1());
     }
+
+    // Execute: forward dimension 0 of the work range to the 1D implementation.
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        // The thread's position is not needed by this kernel.
+        UNUSED(thread_locator);
+
+        const auto range_begin = work_range.get_position(0);
+        const auto range_end   = work_range.get_position_end(0);
+        execute_1d(range_begin, range_end, threadid);
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
index be2f5614be..939788ed8d 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_batched.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -58,7 +58,7 @@ public:
         UNUSED(ldc);
     }
 
-    unsigned int get_window_size() const override {
+    ndrange_t get_window_size() const override {
         return _subgemm->get_window_size();
     }
 
@@ -66,8 +66,8 @@ public:
         _subgemm->set_nthreads(nthreads);
     }
 
-    void execute(unsigned int start, unsigned int end, int threadid) override {
-        _subgemm->execute(start, end, threadid);
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        _subgemm->execute(work_range, thread_locator, threadid);
     }
 
     size_t get_working_size() const override {
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
index 49681ec404..190f4aa643 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_native_transposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -72,12 +72,12 @@ public:
     }
 
     // Window is number of out_width blocks times number of multis.
-    unsigned int get_window_size() const override {
-        return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
+    ndrange_t get_window_size() const override {
+        return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
     }
 
     // Actually execute the GEMV.
-    void execute(unsigned int start, unsigned int end, int) override {
+    void execute_1d(unsigned int start, unsigned int end, int) {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -127,6 +127,17 @@ public:
             }
         }
     }
+
+    // Execute: forward dimension 0 of the work range to the 1D implementation.
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        // The thread's position is not needed by this kernel.
+        UNUSED(thread_locator);
+
+        // Convert (position, size) along dimension 0 into a begin/end pair.
+        const auto range_begin = work_range.get_position(0);
+        const auto range_end   = range_begin + work_range.get_size(0);
+        execute_1d(range_begin, range_end, threadid);
+    }
 };
 
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
index 26fdfba8ff..7f52ac5a14 100644
--- a/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemv_pretransposed.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017-2019 ARM Limited.
+ * Copyright (c) 2017-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -86,12 +86,12 @@ public:
     }
 
     // Window is number of out_width blocks, times number of multis.
-    unsigned int get_window_size() const override {
-        return iceildiv(_Nsize, strategy::out_width()) * _nmultis;
+    ndrange_t get_window_size() const override {
+        return { iceildiv(_Nsize, strategy::out_width()) * _nmultis, 1u, 1u, 1u, 1u, 1u };
     }
 
     // Actually execute the GEMV.
-    void execute(unsigned int start, unsigned int end, int) override {
+    void execute_1d(unsigned int start, unsigned int end, int) {
 #ifdef CYCLE_PROFILING
         profiler prof;
 #endif
@@ -145,6 +145,17 @@ public:
         }
     }
 
+    // N-dimensional entry point: the window only has work in dimension 0, so hand that span on to execute_1d().
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        UNUSED(thread_locator); // not needed by this kernel
+
+        const auto begin  = work_range.get_position(0);
+        const auto length = work_range.get_size(0);
+
+        // execute_1d() is given the start and start + size of dimension 0.
+        execute_1d(begin, begin + length, threadid);
+    }
+
     /* Pretransposed interface implementation */
     bool B_is_pretransposed() const override {
         return true;
diff --git a/src/core/NEON/kernels/arm_gemm/ndrange.hpp b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
index 20824dfc8b..0c068db011 100644
--- a/src/core/NEON/kernels/arm_gemm/ndrange.hpp
+++ b/src/core/NEON/kernels/arm_gemm/ndrange.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2020 Arm Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -23,16 +23,19 @@
  */
 #pragma once
 
+#include <array>
 #include <algorithm>
 #include <initializer_list>
 
+#include <cassert>
+
 namespace arm_gemm {
 
 template<unsigned int D>
 class NDRange {
 private:
-    unsigned int m_sizes[D];
-    unsigned int m_totalsizes[D];
+    std::array<unsigned int, D> m_sizes {};
+    std::array<unsigned int, D> m_totalsizes {};
 
     class NDRangeIterator {
     private:
@@ -81,8 +84,25 @@ private:
     };
 
 public:
+    NDRange& operator=(const NDRange& rhs)=default;
+    NDRange(const NDRange& rhs)           =default;
+
     template <typename... T>
-    NDRange(T... ts) : m_sizes{ts...} {
+    NDRange(T... ts)
+    : m_sizes{ts...}
+    {
+        unsigned int t=1;
+
+        for (unsigned int i=0; i<D; i++) {
+            t *= m_sizes[i];
+
+            m_totalsizes[i] = t;
+        }
+    }
+
+    NDRange(const std::array<unsigned int, D>& n)
+    : m_sizes{n}
+    {
         unsigned int t=1;
 
         for (unsigned int i=0; i<D; i++) {
@@ -105,4 +125,61 @@ public:
     }
 };
 
+/** NDCoordinate builds upon a range, but specifies a starting position
+ * in addition to a size which it inherits from NDRange
+ */
+template<unsigned int N>
+class NDCoordinate : public NDRange<N> {
+    using int_t     =unsigned int;
+    using ndrange_t = NDRange<N>;
+
+    std::array<int_t, N> m_positions {}; // start offset in each dimension
+public:
+    NDCoordinate& operator=(const NDCoordinate& rhs)=default;
+    NDCoordinate(const NDCoordinate& rhs)           =default;
+    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list) // one { position, size } pair per dimension
+    {
+        std::array<int_t, N> sizes {}; // value-initialised: dimensions missing from list get size 0, not garbage
+        assert(list.size() <= N);      // more pairs than dimensions would write past the end of the arrays
+        std::size_t i = 0;
+        for(auto& p : list) {
+            m_positions[i]= p.first;
+            sizes[i++]    = p.second;
+        }
+
+        //update the parents sizes
+        static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
+    }
+
+    int_t get_position(int_t d) const { // start offset of dimension d
+        assert(d < m_positions.size());
+        return m_positions[d];
+    }
+
+    void set_position(int_t d, int_t v) { // overwrite the start offset of dimension d
+        assert(d < m_positions.size()); // member size(): the unqualified size() form only resolves from C++17 on
+        assert(v < ndrange_t::get_size(d)); // NOTE(review): ties the start offset to the extent - confirm this bound is intended
+
+        m_positions[d] = v;
+    }
+
+    int_t get_position_end(int_t d) const { // position + size of dimension d
+        return get_position(d) + NDRange<N>::get_size(d);
+    }
+}; //class NDCoordinate
+
+/** @returns the number of dimensions in the NDRange whose size is not 1,
+ * i.e. the dimensions that hold actual work which can be broken up
+ */
+template<unsigned int N>
+std::size_t ndrange_popcount(const NDRange<N>& ndr) {
+    std::size_t splittable = 0;
+
+    // Walk every dimension and tally those whose size differs from 1.
+    for(unsigned int dim = 0; dim < N; ++dim) {
+        splittable += (ndr.get_size(dim) != 1) ? 1u : 0u;
+    }
+    return splittable;
+}
+
 } // namespace arm_gemm
diff --git a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
index 345060f206..18f030fec0 100644
--- a/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
+++ b/src/core/NEON/kernels/arm_gemm/quantize_wrapper.hpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2019 ARM Limited.
+ * Copyright (c) 2019-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -148,7 +148,7 @@ public:
         set_child_arrays();
     }
 
-    unsigned int get_window_size() const override {
+    ndrange_t get_window_size() const override {
         return _subgemm->get_window_size();
     }
 
@@ -158,8 +158,9 @@ public:
         _args._maxthreads = nthreads;
     }
 
-    void execute(unsigned int start, unsigned int end, int threadid) override {
-        _subgemm->execute(start, end, threadid);
+    // Execute
+    void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) override {
+        _subgemm->execute(work_range, thread_locator, threadid);
         if (!_args._pretransposed_hint) {
             col_sums_runtime(threadid);
         }
diff --git a/src/runtime/CPP/CPPScheduler.cpp b/src/runtime/CPP/CPPScheduler.cpp
index e684eeee98..0a03497cb9 100644
--- a/src/runtime/CPP/CPPScheduler.cpp
+++ b/src/runtime/CPP/CPPScheduler.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2016-2019 ARM Limited.
+ * Copyright (c) 2016-2020 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -71,6 +71,61 @@ private:
     const unsigned int _end;
 };
 
+/** Given two dimensions and a maximum number of threads to utilise, calculate the best
+ * combination of threads that fit in (multiplied together) max_threads.
+ * This algorithm assumes that work in either of the dimensions is equally difficult to compute.
+ *
+ * @param[in] max_threads Upper bound on m_nthreads * n_nthreads
+ * @param[in] m, n        Problem size (number of iterations) in each of the two dimensions
+ * @returns [m_nthreads, n_nthreads] A pair of the threads that should be used in each dimension
+ */
+std::pair<unsigned, unsigned> split_2d(unsigned max_threads, std::size_t m, std::size_t n)
+{
+    /*
+     * We want the same ratio of threads in M & N to the ratio of m and n problem size
+     *
+     * Therefore:    mt/nt == m/n    where mt*nt == max_threads
+     *
+     *          nt = max_threads/mt    &    mt / (max_threads/mt) = m/n
+     *          mt^2 = max_threads * (m/n)
+     *          mt = sqrt( max_threads * (m/n) )
+     */
+    //ratio of m to n in problem dimensions; an empty n dimension is treated as ratio 0 so we never divide by zero
+    const double ratio = (n == 0) ? 0.0 : m / static_cast<double>(n);
+
+    // mt = sqrt(max_threads * (m / n) ), capped at max_threads: nothing larger can be a factor of max_threads
+    const unsigned adjusted = static_cast<unsigned>(
+                    std::min<double>(max_threads, std::round(std::sqrt(max_threads * ratio))));
+
+    //find the nearest factor of max_threads
+    for(unsigned i = 0; i!= adjusted; ++i)
+    {
+        //try down
+        const unsigned adj_down = adjusted - i;
+        if(max_threads % adj_down == 0)
+        {
+            return { adj_down, max_threads / adj_down };
+        }
+
+        //try up
+        const unsigned adj_up = adjusted + i;
+        if(max_threads % adj_up == 0)
+        {
+            return { adj_up, max_threads / adj_up };
+        }
+    }
+
+    //we didn't find anything so lets bail out with maxes biased to the largest dimension
+    if(m > n)
+    {
+         return{ std::min<unsigned>(m, max_threads), 1 };
+    }
+    else
+    {
+        return{ 1, std::min<unsigned>(n, max_threads) };
+    }
+}
+
 /** Execute workloads[info.thread_id] first, then call the feeder to get the index of the next workload to run.
  *
  * Will run workloads until the feeder reaches the end of its range.
@@ -314,50 +369,95 @@ void CPPScheduler::schedule(ICPPKernel *kernel, const Hints &hints)
     ARM_COMPUTE_ERROR_ON_MSG(!kernel, "The child class didn't set the kernel");
 
     const Window      &max_window     = kernel->window();
-    const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
-    const unsigned int num_threads    = std::min(num_iterations, _impl->_num_threads);
 
-    if(num_iterations == 0)
+    if(hints.split_dimension() == IScheduler::split_dimensions_all)
     {
-        return;
-    }
+        /*
+         * if the split dim is size_t max then this signals we should parallelise over
+         * all dimensions
+         */
+        const std::size_t m = max_window.num_iterations(Window::DimX);
+        const std::size_t n = max_window.num_iterations(Window::DimY);
+
+       //in c++17 this can be swapped for   auto [ m_threads, n_threads ] = split_2d(...
+        unsigned m_threads, n_threads;
+        std::tie(m_threads, n_threads) = split_2d(_impl->_num_threads, m, n);
+
+        std::vector<IScheduler::Workload> workloads;
+        for(unsigned int ni  = 0; ni != n_threads; ++ni)
+        {
+            for(unsigned int mi  = 0; mi != m_threads; ++mi)
+            {
+                workloads.push_back(
+                    [ ni, mi, m_threads, n_threads, &max_window, &kernel ]
+                    (const ThreadInfo & info)
+                    {
+                        //narrow the window to our mi-ni workload
+                        Window win = max_window.split_window(Window::DimX, mi, m_threads)
+                                               .split_window(Window::DimY, ni, n_threads);
 
-    if(!kernel->is_parallelisable() || num_threads == 1)
-    {
-        ThreadInfo info;
-        info.cpu_info = &_cpu_info;
-        kernel->run(max_window, info);
+                        win.validate();
+
+                        Window thread_locator;
+                        thread_locator.set(Window::DimX, Window::Dimension(mi, m_threads));
+                        thread_locator.set(Window::DimY, Window::Dimension(ni, n_threads));
+
+                        thread_locator.validate();
+
+                        kernel->run_nd(win, info, thread_locator);
+                    }
+                );
+            }
+        }
+        run_workloads(workloads);
     }
     else
     {
-        unsigned int num_windows = 0;
-        switch(hints.strategy())
+        const unsigned int num_iterations = max_window.num_iterations(hints.split_dimension());
+        const unsigned int num_threads    = std::min(num_iterations, _impl->_num_threads);
+
+        if(num_iterations == 0)
         {
-            case StrategyHint::STATIC:
-                num_windows = num_threads;
-                break;
-            case StrategyHint::DYNAMIC:
-            {
-                const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
-                // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
-                num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
-                break;
-            }
-            default:
-                ARM_COMPUTE_ERROR("Unknown strategy");
+            return;
         }
-        std::vector<IScheduler::Workload> workloads(num_windows);
-        for(unsigned int t = 0; t < num_windows; t++)
+
+        if(!kernel->is_parallelisable() || num_threads == 1)
         {
-            //Capture 't' by copy, all the other variables by reference:
-            workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+            ThreadInfo info;
+            info.cpu_info = &_cpu_info;
+            kernel->run(max_window, info);
+        }
+        else
+        {
+            unsigned int num_windows = 0;
+            switch(hints.strategy())
+            {
+                case StrategyHint::STATIC:
+                    num_windows = num_threads;
+                    break;
+                case StrategyHint::DYNAMIC:
+                {
+                    const unsigned int granule_threshold = (hints.threshold() <= 0) ? num_threads : static_cast<unsigned int>(hints.threshold());
+                    // Make sure we don't use some windows which are too small as this might create some contention on the ThreadFeeder
+                    num_windows = num_iterations > granule_threshold ? granule_threshold : num_iterations;
+                    break;
+                }
+                default:
+                    ARM_COMPUTE_ERROR("Unknown strategy");
+            }
+            std::vector<IScheduler::Workload> workloads(num_windows);
+            for(unsigned int t = 0; t < num_windows; t++)
             {
-                Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
-                win.validate();
-                kernel->run(win, info);
-            };
+                //Capture 't' by copy, all the other variables by reference:
+                workloads[t] = [t, &hints, &max_window, &num_windows, &kernel](const ThreadInfo & info)
+                {
+                    Window win = max_window.split_window(hints.split_dimension(), t, num_windows);
+                    win.validate();
+                    kernel->run(win, info);
+                };
+            }
+            run_workloads(workloads);
         }
-        run_workloads(workloads);
     }
 }
 } // namespace arm_compute
diff --git a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
index a3080e7f29..24bd7d7a8c 100644
--- a/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
+++ b/src/runtime/NEON/functions/NEGEMMAssemblyDispatch.cpp
@@ -280,8 +280,8 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::configure(const ITensor *a, c
     //if we disable this code below in brackets then ConvLayer deadlocks when threads > 1 and
     //the shapes are In=1x1x1024 Weights=1x1x1024x1001 Biases=1001 Out=1x1x1001
     {
-        const int window_size = _gemm_kernel_asm->get_window_size();
-        if(window_size < args._maxthreads)
+        const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
+        if(window_size < static_cast<unsigned int>(args._maxthreads))
         {
             _gemm_kernel_asm->set_nthreads(window_size);
         }
@@ -404,7 +404,7 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
     if(_workspace.buffer() != nullptr)
     {
         _gemm_kernel_asm->set_working_space(reinterpret_cast<void *>(_workspace.buffer()));
-        const unsigned int window_size = _gemm_kernel_asm->get_window_size();
+        const unsigned int window_size = get_total_window_size(*_gemm_kernel_asm);
         unsigned int       num_threads = NEScheduler::get().num_threads();
         if(window_size < num_threads)
         {
@@ -427,14 +427,21 @@ void Fallback<TypeInput, TypeOutput, OutputStage>::run()
                                  in1_ptr, ldb, multi_stride_b,
                                  out_ptr, ldd, batch_stride_d, multi_stride_d,
                                  bias, 0);
-
     // Schedule assembly kernel
     IScheduler::Hints scheduling_hint = IScheduler::Hints(Window::DimX);
     if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED && _d->info()->data_type() == DataType::F32)
     {
         const int granule_threshold = 200;
         scheduling_hint             = IScheduler::Hints(Window::DimX, IScheduler::StrategyHint::DYNAMIC, granule_threshold);
+
+    }
+    else if(_kernel_info.method == arm_gemm::GemmMethod::GEMM_INTERLEAVED_2D && _d->info()->data_type() == DataType::F32)
+    {
+        //GEMM_INTERLEAVED_2D supports 2D parallelism, IScheduler::split_dimensions_all signals to parallelise over all window dimensions
+        const int granule_threshold = 200;
+        scheduling_hint             = IScheduler::Hints(IScheduler::split_dimensions_all, IScheduler::StrategyHint::STATIC, granule_threshold);
     }
+
     NEScheduler::get().schedule(_optimised_kernel.get(), scheduling_hint);
 }