From 5aa1a0b7ca5eed010e4b297a95b1c4851f741328 Mon Sep 17 00:00:00 2001
From: Georgios Pinitas
Date: Thu, 2 Jul 2020 20:02:20 +0100
Subject: COMPID-3324: Clean GEMM kernels

Signed-off-by: Georgios Pinitas
Change-Id: I170de1671e061a78740caee31fb4a1b8642c1369
Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/3505
Tested-by: Arm Jenkins
Comments-Addressed: Arm Jenkins
Reviewed-by: Michele Di Giorgio
---
 src/core/NEON/kernels/assembly/arm_gemm.hpp    | 106 +++++++++--------
 src/core/NEON/kernels/assembly/gemm_common.hpp | 150 +++++++++++------------
 src/core/NEON/kernels/assembly/ndrange.hpp     | 158 ++++++++++++++-----------
 3 files changed, 220 insertions(+), 194 deletions(-)

(limited to 'src/core/NEON/kernels/assembly')

diff --git a/src/core/NEON/kernels/assembly/arm_gemm.hpp b/src/core/NEON/kernels/assembly/arm_gemm.hpp
index 7723224ec8..2df7132500 100644
--- a/src/core/NEON/kernels/assembly/arm_gemm.hpp
+++ b/src/core/NEON/kernels/assembly/arm_gemm.hpp
@@ -23,14 +23,14 @@
  */
 #pragma once

-#include <memory>
 #include <cstring>
+#include <memory>

 #include "arm_gemm_local.hpp"
 #include "gemm_common.hpp"

-namespace arm_gemm {
-
+namespace arm_gemm
+{
 enum class GemmMethod
 {
     DEFAULT,
@@ -47,12 +47,17 @@
 struct KernelDescription
 {
-    GemmMethod   method     = GemmMethod::DEFAULT;
-    std::string  name       = "";
-    bool         is_default = false;
+    GemmMethod  method     = GemmMethod::DEFAULT;
+    std::string name       = "";
+    bool        is_default = false;

-    KernelDescription(GemmMethod m, std::string n, bool d=false) : method(m), name(n), is_default(d) { }
-    KernelDescription() noexcept { }
+    KernelDescription(GemmMethod m, std::string n, bool d = false)
+        : method(m), name(n), is_default(d)
+    {
+    }
+    KernelDescription() noexcept
+    {
+    }
 };

 struct GemmConfig
@@ -62,23 +67,32 @@ struct GemmConfig
     unsigned int inner_block_size = 0;
     unsigned int outer_block_size = 0;

-    GemmConfig(GemmMethod method) : method(method) { }
-    GemmConfig() { }
+    GemmConfig(GemmMethod method)
+        : method(method)
+    {
+    }
+    GemmConfig()
+    {
+    }
 };

 struct Activation
 {
-    enum class Type {
+    enum class Type
+    {
         None,
         ReLU,
         BoundedReLU
     };

-    Type   type;
-    float  param1;
-    float  param2;
+    Type  type;
+    float param1;
+    float param2;

-    Activation(Type type=Type::None, float p1=0.0f, float p2=0.0f) : type(type), param1(p1), param2(p2) { }
+    Activation(Type type = Type::None, float p1 = 0.0f, float p2 = 0.0f)
+        : type(type), param1(p1), param2(p2)
+    {
+    }
 };

 struct GemmArgs
@@ -101,10 +115,8 @@ public:
              const unsigned int K, const unsigned int nbatches,
              const unsigned int nmulti, const bool trA, const bool trB,
              Activation act, const int maxthreads,
-             const bool pretransposed_hint, const GemmConfig *cfg=nullptr ) :
-             _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti),
-             _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads),
-             _pretransposed_hint(pretransposed_hint), _cfg(cfg)
+             const bool pretransposed_hint, const GemmConfig *cfg = nullptr)
+        : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _act(act), _maxthreads(maxthreads), _pretransposed_hint(pretransposed_hint), _cfg(cfg)
     {
     }
 };
@@ -112,18 +124,18 @@ public:
 struct Requantize32
 {
 public:
-    const int32_t  *bias = nullptr;
-    size_t          bias_multi_stride = 0;
-    int32_t         a_offset = 0;
-    int32_t         b_offset = 0;
-    int32_t         c_offset = 0;
-    bool            per_channel_requant = false;
-    int32_t         per_layer_shift = 0;
-    int32_t         per_layer_mul = 0;
-    const int32_t  *per_channel_shifts = nullptr;
-    const int32_t  *per_channel_muls = nullptr;
-    int32_t         minval = 0;
-    int32_t         maxval = 0;
+    const int32_t *bias                = nullptr;
+    size_t         bias_multi_stride   = 0;
+    int32_t        a_offset            = 0;
+    int32_t        b_offset            = 0;
+    int32_t        c_offset            = 0;
+    bool           per_channel_requant = false;
+    int32_t        per_layer_shift     = 0;
+    int32_t        per_layer_mul       = 0;
+    const int32_t *per_channel_shifts  = nullptr;
+    const int32_t *per_channel_muls    = nullptr;
+    int32_t        minval              = 0;
+    int32_t        maxval              = 0;

     Requantize32() = default;

@@ -131,11 +143,9 @@ public:
     Requantize32(const int32_t *bias, size_t bias_multi_stride,
                  int32_t a_offset, int32_t b_offset, int32_t c_offset,
                  int32_t requant_shift, int32_t requant_mul,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
-        minval(minv), maxval(maxv)
+                 int32_t minv, int32_t maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(false), per_layer_shift(requant_shift), per_layer_mul(requant_mul),
+          minval(minv), maxval(maxv)
     {
     }

@@ -143,11 +153,9 @@ public:
     Requantize32(const int32_t *bias, size_t bias_multi_stride,
                  int32_t a_offset, int32_t b_offset, int32_t c_offset,
                  const int32_t *requant_shifts, const int32_t *requant_muls,
-                 int32_t minv, int32_t maxv) :
-        bias(bias), bias_multi_stride(bias_multi_stride),
-        a_offset(a_offset), b_offset(b_offset), c_offset(c_offset),
-        per_channel_requant(true), per_channel_shifts(requant_shifts), per_channel_muls(requant_muls),
-        minval(minv), maxval(maxv)
+                 int32_t minv, int32_t maxv)
+        : bias(bias), bias_multi_stride(bias_multi_stride), a_offset(a_offset), b_offset(b_offset), c_offset(c_offset), per_channel_requant(true), per_channel_shifts(requant_shifts),
+          per_channel_muls(requant_muls), minval(minv), maxval(maxv)
     {
     }
 };
@@ -156,21 +164,21 @@ struct Nothing
 {
 };

-template<typename Top, typename Tret>
-using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret> >;
+template <typename Top, typename Tret>
+using UniqueGemmCommon = std::unique_ptr<GemmCommon<Top, Tret>>;

 /* Low level API calls.
  * These are implemented as 'GemmArgs' versions, or with the arguments explicitly listed. */

 /* get_gemm_method(): Given the templated types and provided parameters,
  * which is the preferred method to implement this GEMM? */
-template<typename Top, typename Tret, class OutputStage = Nothing>
-KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+KernelDescription get_gemm_method(const GemmArgs &args, const OutputStage & = {});

-template<typename Top, typename Tret, class OutputStage = Nothing>
-UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+UniqueGemmCommon<Top, Tret> gemm(const GemmArgs &args, const OutputStage & = {});

-template<typename Top, typename Tret, class OutputStage = Nothing>
-std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & ={});
+template <typename Top, typename Tret, class OutputStage = Nothing>
+std::vector<KernelDescription> get_compatible_kernels(const GemmArgs &args, const OutputStage & = {});

 } // namespace arm_gemm

diff --git a/src/core/NEON/kernels/assembly/gemm_common.hpp b/src/core/NEON/kernels/assembly/gemm_common.hpp
index a44b774b9d..3b4c025371 100644
--- a/src/core/NEON/kernels/assembly/gemm_common.hpp
+++ b/src/core/NEON/kernels/assembly/gemm_common.hpp
@@ -23,15 +23,12 @@
  */
 #pragma once

-#include "arm_gemm_compute_iface.hpp"
+#include "ndrange.hpp"

 #include <cstddef>
-#include <cassert>
-
-#define UNUSED(x) (void)(x)

-namespace arm_gemm {
-
+namespace arm_gemm
+{
 // Abstract class for the GEMM/GEMV functions.
 //
 // GEMM implementations may be "native" (never require any input
@@ -41,7 +38,8 @@
 //
 // The real GemmCommon class is templated based on the operand and return
 // type.  This is an interface class which is independent of those types.
-class IGemmCommon {
+class IGemmCommon
+{
 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides. This "generic" version uses void *s, the preferred version
@@ -50,9 +48,9 @@ public:
      * the settings for B here are ignored.
      */
     virtual void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;
+                                    const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                                    void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                                    const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) = 0;

     /** @returns an ndrange containing ranges of the compute space which can be
      * broken up and parallelised over
@@ -71,47 +69,64 @@ public:
      * This has an empty default implementation, as GEMMs which don't care
      * about thread count can safely ignore this.
      */
-    virtual void set_nthreads(int) { };
+    virtual void set_nthreads(int) {};

     /* Whether this GEMM can be dynamically scheduled or not. */
-    virtual bool supports_dynamic_scheduling() const { return false; }
+    virtual bool supports_dynamic_scheduling() const
+    {
+        return false;
+    }

     /** Main execute member fucntion
      * @param [in] work_range specifies the range of work we want to be computed, total range defined by get_window_size()
      * @param [in] thread_locator where are we inside of the thread space
      * @naram [in] threadid a unique threadid
      */
-    virtual void execute(const ndcoord_t& work_range, const ndcoord_t& thread_locator, int threadid) = 0;
+    virtual void execute(const ndcoord_t &work_range, const ndcoord_t &thread_locator, int threadid) = 0;

     /*** Working space interface (optional) ***/
     /* Total number of bytes of temporary working space needed.  If zero, it's not necessary to call set_working_space(). */
-    virtual size_t get_working_size() const { return 0; }
+    virtual size_t get_working_size() const
+    {
+        return 0;
+    }

     /* Provide working space buffer - the void * passed in must remain allocated for the duration of any execute calls. */
-    virtual void set_working_space(void *) { };
+    virtual void set_working_space(void *) {};

     /*** "Pretransposed" interface (optional) ***/
    /* Is this object set up for pretranspose?  If so, pretranspose_array() needs to be called before execute(); */
-    virtual bool B_is_pretransposed() const { return false; }
+    virtual bool B_is_pretransposed() const
+    {
+        return false;
+    }

     /* Does pretranspose still need to be done? */
-    virtual bool B_pretranspose_required() const { return false; }
+    virtual bool B_pretranspose_required() const
+    {
+        return false;
+    }

     /* Total number of bytes of space needed for pretransposed arrays. */
-    virtual size_t get_B_pretransposed_array_size() const { return 0; }
+    virtual size_t get_B_pretransposed_array_size() const
+    {
+        return 0;
+    }

     /* Perform pretranspose - arguments are output, input, input row stride and input multi stride. */
     /* The "real" version of this depends on the templated operand type (see below). */
     virtual void pretranspose_B_array_generic(void *, const void *, const int, const int) = 0;

     /* Set pretransposed data - the void * passed in must previously have been passed to pretranspose_B_array() for the same or a similar GEMM. */
-    virtual void set_pretransposed_B_data(void *) { }
+    virtual void set_pretransposed_B_data(void *)
+    {
+    }

     /*** "Quantized bias" interface (optional) ***/
     /* Set the bias vector for quantized GEMMs */
-    virtual void set_quantized_bias(const int32_t *bias, size_t bias_multi_stride)
+    virtual void set_quantized_bias(const int32_t *, size_t)
     {
-        UNUSED(bias);
-        UNUSED(bias_multi_stride);
     }

     // Destructor
-    virtual ~IGemmCommon() { }
+    virtual ~IGemmCommon()
+    {
+    }
 };

 /* "Real" GemmCommon class which is templated on the operand and return types.
@@ -121,50 +136,53 @@ public:
  * 'set_arrays' to capture the provided arguments in protected class
  * members, as essentially any implementation will need these.
  */
-template<typename To, typename Tr>
-class GemmCommon : public IGemmCommon {
+template <typename To, typename Tr>
+class GemmCommon : public IGemmCommon
+{
 protected:
-    const To *_Aptr=nullptr;
-    int _lda=0;
-    int _A_batch_stride=0;
-    int _A_multi_stride=0;
-    const To *_Bptr=nullptr;
-    int _ldb=0;
-    int _B_multi_stride=0;
-    Tr *_Cptr=nullptr;
-    int _ldc=0;
-    int _C_batch_stride=0;
-    int _C_multi_stride=0;
-    const Tr *_bias=nullptr;
-    int _bias_multi_stride=0;
+    const To *_Aptr              = nullptr;
+    int       _lda               = 0;
+    int       _A_batch_stride    = 0;
+    int       _A_multi_stride    = 0;
+    const To *_Bptr              = nullptr;
+    int       _ldb               = 0;
+    int       _B_multi_stride    = 0;
+    Tr       *_Cptr              = nullptr;
+    int       _ldc               = 0;
+    int       _C_batch_stride    = 0;
+    int       _C_multi_stride    = 0;
+    const Tr *_bias              = nullptr;
+    int       _bias_multi_stride = 0;

 public:
     /* Pass in the pointers to the arrays to be operated on and their
      * strides (templated version with appropriate types). */
     virtual void set_arrays(const To *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride) {
-        _Aptr = A;
-        _lda = lda;
-        _A_batch_stride = A_batch_stride;
-        _A_multi_stride = A_multi_stride;
-        _Bptr = B;
-        _ldb = ldb;
-        _B_multi_stride = B_multi_stride;
-        _Cptr = C;
-        _ldc = ldc;
-        _C_batch_stride = C_batch_stride;
-        _C_multi_stride = C_multi_stride;
-        _bias = bias;
+                            const To *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                            Tr *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                            const Tr *bias, /* no row or batch stride needed */ const int bias_multi_stride)
+    {
+        _Aptr               = A;
+        _lda                = lda;
+        _A_batch_stride     = A_batch_stride;
+        _A_multi_stride     = A_multi_stride;
+        _Bptr               = B;
+        _ldb                = ldb;
+        _B_multi_stride     = B_multi_stride;
+        _Cptr               = C;
+        _ldc                = ldc;
+        _C_batch_stride     = C_batch_stride;
+        _C_multi_stride     = C_multi_stride;
+        _bias               = bias;
         _bias_multi_stride = bias_multi_stride;
     }
     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
     void set_arrays_generic(const void *A, const int lda, const int A_batch_stride, const int A_multi_stride,
-                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
-                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
-                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override {
+                            const void *B, const int ldb, /* batches share B */ const int B_multi_stride,
+                            void *C, const int ldc, const int C_batch_stride, const int C_multi_stride,
+                            const void *bias, /* no row or batch stride needed */ const int bias_multi_stride) override
+    {
         set_arrays(static_cast<const To *>(A), lda, A_batch_stride, A_multi_stride,
                    static_cast<const To *>(B), ldb, B_multi_stride,
                    static_cast<Tr *>(C), ldc, C_batch_stride, C_multi_stride,
                    static_cast<const Tr *>(bias), bias_multi_stride);
     }

@@ -175,27 +193,13 @@ public:
     /* Perform pretranspose - the void * passed in must remain allocated for the duration of any execute calls. */
     /* Arguments are: output buffer pointer, source pointer, source row stride, source multi stride */
-    virtual void pretranspose_B_array(void *, const To *, const int, const int) { };
+    virtual void pretranspose_B_array(void *, const To *, const int, const int) {};

     /* Implementation of the void * overload which casts its arguments to the appropriate type. */
-    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override {
+    void pretranspose_B_array_generic(void *out, const void *in, const int row_stride, const int multi_stride) override
+    {
         pretranspose_B_array(out, static_cast<const To *>(in), row_stride, multi_stride);
     }
 };

-template <typename GemmKernel>
-inline
-int unsigned get_total_window_size(const GemmKernel& kernel)
-{
-    auto window=kernel.get_window_size();
-
-    unsigned int total = 1;
-    for(unsigned i = 0; i != arm_gemm::ndrange_max; ++i)
-    {
-        total *= window.get_size(i);
-    }
-
-    return total;
-}
-
 } // namespace arm_gemm

diff --git a/src/core/NEON/kernels/assembly/ndrange.hpp b/src/core/NEON/kernels/assembly/ndrange.hpp
index d082a3e9b8..86638298ab 100644
--- a/src/core/NEON/kernels/assembly/ndrange.hpp
+++ b/src/core/NEON/kernels/assembly/ndrange.hpp
@@ -23,104 +23,123 @@
  */
 #pragma once

-#include <array>
 #include <algorithm>
-#include <initializer_list>
-
+#include <array>
 #include <cassert>
+#include <initializer_list>

-namespace arm_gemm {
-
-template<unsigned int D>
-class NDRange {
+namespace arm_gemm
+{
+template <unsigned int D>
+class NDRange
+{
 private:
-    std::array<unsigned int, D> m_sizes {};
-    std::array<unsigned int, D> m_totalsizes {};
+    std::array<unsigned int, D> m_sizes{};
+    std::array<unsigned int, D> m_totalsizes{};

-    class NDRangeIterator {
+    class NDRangeIterator
+    {
     private:
         const NDRange &m_parent;
-        unsigned int m_pos = 0;
-        unsigned int m_end = 0;
+        unsigned int   m_pos = 0;
+        unsigned int   m_end = 0;

     public:
-        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e) : m_parent(p), m_pos(s), m_end(e) { }
+        NDRangeIterator(const NDRange &p, unsigned int s, unsigned int e)
+            : m_parent(p), m_pos(s), m_end(e)
+        {
+        }

-        bool done() const {
+        bool done() const
+        {
             return (m_pos >= m_end);
         }

-        unsigned int dim(unsigned int d) const {
+        unsigned int dim(unsigned int d) const
+        {
             unsigned int r = m_pos;

-            if (d < (D - 1)) {
+            if(d < (D - 1))
+            {
                 r %= m_parent.m_totalsizes[d];
             }

-            if (d > 0) {
-                r /= m_parent.m_totalsizes[d-1];
+            if(d > 0)
+            {
+                r /= m_parent.m_totalsizes[d - 1];
             }

             return r;
         }

-        bool next_dim0() {
+        bool next_dim0()
+        {
             m_pos++;
             return !done();
         }

-        bool next_dim1() {
+        bool next_dim1()
+        {
             m_pos += m_parent.m_sizes[0] - dim(0);
             return !done();
         }

-        unsigned int dim0_max() const {
+        unsigned int dim0_max() const
+        {
             unsigned int offset = std::min(m_end - m_pos, m_parent.m_sizes[0] - dim(0));
             return dim(0) + offset;
         }
     };

-public:
-    NDRange& operator=(const NDRange& rhs)=default;
-    NDRange(const NDRange& rhs) =default;
-
-    template<typename... T>
-    NDRange(T... ts)
-    : m_sizes{ts...}
+    void set_totalsizes()
     {
-        unsigned int t=1;
+        unsigned int t = 1;
+
+        for(unsigned int i = 0; i < D; i++)
+        {
+            if(m_sizes[i] == 0)
+            {
+                m_sizes[i] = 1;
+            }

-        for (unsigned int i=0; i<D; i++) {
             t *= m_sizes[i];

             m_totalsizes[i] = t;
         }
     }

-    NDRange(const std::array<unsigned int, D>& n)
-    : m_sizes(n)
-    {
-        unsigned int t=1;
+public:
+    NDRange &operator=(const NDRange &rhs) = default;
+    NDRange(const NDRange &rhs)            = default;

-        for (unsigned int i=0; i<D; i++) {
-            t *= m_sizes[i];
+    template <typename... T>
+    NDRange(T... ts)
+        : m_sizes{ ts... }
+    {
+        set_totalsizes();
+    }

-            m_totalsizes[i] = t;
-        }
+    NDRange(const std::array<unsigned int, D> &n)
+        : m_sizes(n)
+    {
+        set_totalsizes();
     }

-    NDRangeIterator iterator(unsigned int start, unsigned int end) const {
+    NDRangeIterator iterator(unsigned int start, unsigned int end) const
+    {
         return NDRangeIterator(*this, start, end);
     }

-    unsigned int total_size() const {
+    unsigned int total_size() const
+    {
         return m_totalsizes[D - 1];
     }

-    unsigned int get_size(unsigned int v) const {
+    unsigned int get_size(unsigned int v) const
+    {
         return m_sizes[v];
     }
 };
@@ -128,58 +147,53 @@ public:
 /** NDCoordinate builds upon a range, but specifies a starting position
  * in addition to a size which it inherits from NDRange
  */
-template<unsigned int N>
-class NDCoordinate : public NDRange<N> {
-    using int_t =unsigned int;
+template <unsigned int N>
+class NDCoordinate : public NDRange<N>
+{
+    using int_t     = unsigned int;
     using ndrange_t = NDRange<N>;

-    std::array<int_t, N> m_positions {};
+    std::array<int_t, N> m_positions{};
+
 public:
-    NDCoordinate& operator=(const NDCoordinate& rhs)=default;
-    NDCoordinate(const NDCoordinate& rhs) =default;
-    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>>& list)
+    NDCoordinate &operator=(const NDCoordinate &rhs) = default;
+    NDCoordinate(const NDCoordinate &rhs)            = default;
+    NDCoordinate(const std::initializer_list<std::pair<int_t, int_t>> &list)
     {
         std::array<int_t, N> sizes{};

         std::size_t i = 0;
-        for(auto& p : list) {
-            m_positions[i]= p.first;
-            sizes[i++] = p.second;
+        for(auto &p : list)
+        {
+            m_positions[i] = p.first;
+            sizes[i++]     = p.second;
         }

         //update the parents sizes
-        static_cast<ndrange_t&>(*this) = ndrange_t(sizes);
+        static_cast<ndrange_t &>(*this) = ndrange_t(sizes);
     }

-    int_t get_position(int_t d) const {
-        assert(d < m_positions.size());
+    int_t get_position(int_t d) const
+    {
+        assert(d < N);
+
         return m_positions[d];
     }

-    void set_position(int_t d, int_t v) {
-        assert(d < size(m_positions));
-        assert(v < ndrange_t::get_size(d));
+    void set_position(int_t d, int_t v)
+    {
+        assert(d < N);

         m_positions[d] = v;
     }

-    int_t get_position_end(int_t d) const {
-        return get_position(d) + NDRange<N>::get_size(d);
+    int_t get_position_end(int_t d) const
+    {
+        return get_position(d) + ndrange_t::get_size(d);
     }
 }; //class NDCoordinate

-/** @returns the number of dimensions in the NDRange which have none-1 values
- * IE there is actual work in these dimensions that can be broken up
- */
-template<unsigned int N>
-std::size_t ndrange_popcount(const NDRange<N>& ndr) {
-    std::size_t count = 0;
-
-    for(unsigned int d = 0; d != N; ++d) {
-        if(ndr.get_size(d) != 1)
-            ++count;
-    }
-    return count;
-}
+using ndrange_t = NDRange<6>;
+using ndcoord_t = NDCoordinate<6>;

 } // namespace arm_gemm
--
cgit v1.2.1
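
For illustration, a minimal sketch of how the 6-dimensional window types introduced above (ndrange_t / ndcoord_t) can be consumed by a caller. The include path, function name and example sizes are assumptions, not part of the commit; only the NDRange / NDCoordinate interface visible in ndrange.hpp is exercised.

// Hypothetical usage sketch for the ndrange_t / ndcoord_t aliases above.
#include <cassert>

#include "ndrange.hpp" // assumed include path

void window_example()
{
    using namespace arm_gemm;

    // A 6-dimensional compute window: e.g. 20 work items in dimension 0,
    // 3 in dimension 2; unused dimensions are given size 1.
    ndrange_t window(20u, 1u, 3u, 1u, 1u, 1u);

    // total_size() is the product of all dimension sizes.
    assert(window.total_size() == 60);

    // A work range carries a (start, size) pair per dimension - here the
    // first half of dimension 0 and the full extent of the others.
    ndcoord_t work_range{ { 0, 10 }, { 0, 1 }, { 0, 3 }, { 0, 1 }, { 0, 1 }, { 0, 1 } };

    assert(work_range.get_position(0) == 0);      // start of dimension 0
    assert(work_range.get_size(0) == 10);         // extent of dimension 0
    assert(work_range.get_position_end(0) == 10); // start + extent

    // An iterator walks a flat [start, end) sub-range of the window and
    // recovers per-dimension coordinates on demand.
    for(auto it = window.iterator(0, window.total_size() / 2); !it.done(); it.next_dim0())
    {
        unsigned int dim0_coord = it.dim(0); // coordinate in dimension 0
        unsigned int dim2_coord = it.dim(2); // coordinate in dimension 2
        (void)dim0_coord;
        (void)dim2_coord;
    }
}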