From 5f707736413aeac77818c42838296966f8dc6761 Mon Sep 17 00:00:00 2001
From: Anthony Barbier
Date: Tue, 3 Jul 2018 16:22:02 +0100
Subject: COMPMID-1369: Revert accidental formatting of RSH's repo

Pulled latest fixes from David's repo:

commit f43ebe932c84083332b0b1a0348241b69dda63a7
Author: David Mansell
Date:   Tue Jul 3 18:09:01 2018 +0100

    Whitespace tidying, fixed comment in gemv_batched imported from ACL.

Change-Id: Ie37a623f44e90d88072236cb853ac55ac82d5f51
Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/138530
Tested-by: Jenkins
Reviewed-by: Georgios Pinitas
Reviewed-by: Gian Marco Iodice
Reviewed-by: David Mansell
Reviewed-by: Anthony Barbier
---
 .../NEON/kernels/arm_gemm/gemm_interleaved.hpp | 383 ++++++++-------------
 1 file changed, 151 insertions(+), 232 deletions(-)

diff --git a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
index 32c65cd3fb..c304edd1f9 100644
--- a/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
+++ b/src/core/NEON/kernels/arm_gemm/gemm_interleaved.hpp
@@ -23,8 +23,8 @@ */ #pragma once -#include #include +#include #include @@ -41,23 +41,22 @@ // Some macros used to decide how much working space to allocate. // Round allocations up to the next cache line. -#define ALLOC_ROUND 64 -#define ROUND_UP(x) ((((x) + ALLOC_ROUND - 1) / ALLOC_ROUND) * ALLOC_ROUND) +#define ALLOC_ROUND 64 +#define ROUND_UP(x) ((((x) + ALLOC_ROUND-1) / ALLOC_ROUND) * ALLOC_ROUND) // Implementation of the GemmCommon abstract class. // // This implementation interleaves the source matrices in blocks - good for // larger matrices. -namespace arm_gemm -{ -template -class GemmInterleaved : public GemmCommon -{ +namespace arm_gemm { + +template +class GemmInterleaved : public GemmCommon { typedef typename strategy::operand_type Toi; - typedef typename strategy::result_type Tri; + typedef typename strategy::result_type Tri; /* const properties set by constructor */ - const CPUInfo *const _ci; + const CPUInfo * const _ci; const unsigned int _Msize; const unsigned int _Nsize; @@ -72,173 +71,138 @@ class GemmInterleaved : public GemmCommon const Tr _alpha; const Tr _beta; - const unsigned int _maxthreads; - const bool _pretransposed; + const int _maxthreads; + int _nthreads; + const bool _pretransposed; /* Blocking info */ - unsigned int _k_block = 0; - unsigned int _x_block = 0; - unsigned int _Mround = 0; + unsigned int _k_block=0; + unsigned int _x_block=0; + unsigned int _Mround=0; /* Working space, pretransposed buffer, buffer manager */ - const Toi *_B_transposed = nullptr; - BufferManager *_bm = nullptr; - void *_working_space = nullptr; + const Toi *_B_transposed=nullptr; + BufferManager *_bm=nullptr; + void *_working_space=nullptr; /* We will need to walk through the blocks of B in a few contexts, so * factor that out. */ - class blockwalker - { + class blockwalker { private: /* Size loops, etc. based on our parent's configuration */ const GemmInterleaved &_parent; - /* K and X and multi parameters for current iteration. */ - unsigned int _k0 = 0, _x0 = 0, _multi = 0; + /* K, X and multi parameters for current iteration.
*/ + unsigned int _k0=0, _x0=0, _multi=0; - unsigned int _index = 0; - bool _done = false; - bool _newkblock = true; - bool _newmulti = true; + unsigned int _index=0; + bool _done=false; + bool _newkblock=true; + bool _newmulti=true; public: - blockwalker(const GemmInterleaved &parent) - : _parent(parent) - { - } + blockwalker(const GemmInterleaved &parent) : _parent(parent) { } - unsigned int xmax() - { + unsigned int xmax() { return std::min(_x0 + _parent._x_block, _parent._Nsize); } - unsigned int kmax() - { + unsigned int kmax() { return std::min(_k0 + _parent._k_block, _parent._Ksize); } /* Advance to the next block, return false at the end. */ - bool advance(void) - { - if(_done) - { + bool advance(void) { + if (_done) { return false; } - _newkblock = false; + _newkblock=false; _x0 += _parent._x_block; - if(_x0 >= _parent._Nsize) - { - _x0 = 0; + if (_x0 >= _parent._Nsize) { + _x0=0; _k0 += _parent._k_block; - if(_k0 >= _parent._Ksize) - { - _k0 = 0; + if (_k0 >= _parent._Ksize) { + _k0=0; _multi++; - if(_multi >= _parent._nmulti) - { - _done = true; + if (_multi >= _parent._nmulti) { + _done=true; return false; } - _newmulti = true; + _newmulti=true; } - _newkblock = true; + _newkblock=true; } _index++; return true; } - unsigned int k0(void) - { - return _k0; - } - unsigned int x0(void) - { - return _x0; - } - unsigned int multi(void) - { - return _multi; - } - unsigned int index(void) - { - return _index; - } - bool done(void) - { - return _done; - } - bool newkblock(void) - { - return _newkblock; - } + unsigned int k0(void) { return _k0; } + unsigned int x0(void) { return _x0; } + unsigned int multi(void) { return _multi; } + unsigned int index(void) { return _index; } + bool done(void) { return _done; } + bool newkblock(void) { return _newkblock; } }; // A working size: One of these needed, regardless of thread count. Divided according to window. - size_t get_a_working_size() const - { + size_t get_a_working_size() const { return ROUND_UP(sizeof(Toi) * _k_block * _Mround * _nbatches); } // B working size: 0, 1 or 3 of these needed depending on pretransposed and threading settings. - size_t get_b_working_size() const - { + size_t get_b_working_size() const { return ROUND_UP(sizeof(Toi) * _x_block * _k_block); } // C working size: One needed per thread. - size_t get_c_working_size() const - { + size_t get_c_working_size() const { return ROUND_UP(sizeof(Tri) * _x_block * strategy::out_height); } // Internal execute function. // This supports both the "pretransposed" and "standard" interfaces via the template parameter. - template - void execute_internal(unsigned int start, unsigned int end, int threadid) - { + template + void execute_internal(unsigned int start, unsigned int end, int threadid) { #ifdef CYCLE_PROFILING profiler prof; #endif - strategy strat(_ci); blockwalker current(*this); - blockwalker next = current; + blockwalker next=current; /* Translate 'start' and 'end' into a position within the batches and rows. */ const unsigned int window_per_batch = _Mround / strategy::out_height; - unsigned int batch_0 = start / window_per_batch; - unsigned int batch_end = end / window_per_batch; + unsigned int batch_0 = start / window_per_batch; + unsigned int batch_end = end / window_per_batch; /* Compute the M values to operate on */ unsigned int m_0 = (start - (batch_0 * window_per_batch)) * strategy::out_height; unsigned int m_max = (end - (batch_end * window_per_batch)) * strategy::out_height; /* Make sure we've been set up correctly. 
*/ - if(pretransposed) - { + if (pretransposed) { assert(_B_transposed); - } - else - { + } else { assert(_bm); } assert(_working_space); int8_t *working_space_bytes = reinterpret_cast(_working_space); - // Private buffers. Treat working_space as an array of C buffers (one per thread) first, followed by the (window-divided) A buffer. + // Private buffers. Treat working_space as an array of C buffers + // (one per thread) first, followed by the (window-divided) A + // buffer. // Set a_panel to the base of the A buffers - compute offsets into it based on M/batches later. - Toi *const a_panel = reinterpret_cast(working_space_bytes + (_maxthreads * get_c_working_size())); - Tri *const c_panel = reinterpret_cast(working_space_bytes + (threadid * get_c_working_size())); + Toi * const a_panel = reinterpret_cast(working_space_bytes + (_maxthreads * get_c_working_size())); + Tri * const c_panel = reinterpret_cast(working_space_bytes + (threadid * get_c_working_size())); // Shared buffers - these come either from BufferManager or _B_transposed. const Toi *b_panel; - if(pretransposed) - { + if (pretransposed) { b_panel = _B_transposed; } @@ -247,33 +211,28 @@ class GemmInterleaved : public GemmCommon // newkblock() is always true on the first iteration, so this will be set properly on the first loop. int kern_k = 0; - for(; !current.done(); current.advance()) - { - if(current.newkblock()) - { + for (;!current.done();current.advance()) { + if (current.newkblock()) { #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height * (current.kmax() - current.k0()) * sizeof(Toi)); + auto p=prof.ScopedProfiler(PROFILE_PREPA, (end - start) * strategy::out_height * (current.kmax()-current.k0()) * sizeof(Toi)); #endif - for(unsigned int batch = batch_0; batch <= batch_end; batch++) - { - unsigned int first_m = (batch == batch_0) ? m_0 : 0; + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; unsigned int last_m = (batch == batch_end) ? m_max : _Msize; - if(first_m >= last_m) + if (first_m >= last_m) continue; - if(_trA ^ strategy::A_transpose) - { + + if (_trA ^ strategy::A_transpose) { Transform( - a_panel + ((batch * _Mround + first_m) * _k_block), - this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), - this->_lda, first_m, last_m, current.k0(), current.kmax()); - } - else - { + a_panel + ((batch * _Mround + first_m) * _k_block), + this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), + this->_lda, first_m, last_m, current.k0(), current.kmax()); + } else { Transform( - a_panel + ((batch * _Mround + first_m) * _k_block), - this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), - this->_lda, first_m, last_m, current.k0(), current.kmax()); + a_panel + ((batch * _Mround + first_m) * _k_block), + this->_Aptr + (batch * this->_A_batch_stride) + (current.multi() * this->_A_multi_stride), + this->_lda, first_m, last_m, current.k0(), current.kmax()); } } @@ -284,8 +243,7 @@ class GemmInterleaved : public GemmCommon int bblocks = iceildiv(current.xmax() - current.x0(), strategy::out_width); - if(!pretransposed) - { + if (!pretransposed) { /* Look ahead to the next block and populate it if necessary. 
* This avoids the populate operation becoming a bottleneck, and * helps keep the threads synchronized (the first thread to get @@ -294,71 +252,60 @@ class GemmInterleaved : public GemmCommon * If we are running single threaded, bm->try_populate() will do * nothing. */ - if(next.advance()) - { - _bm->try_populate(next.index(), [&](void *buffer) - { + if (next.advance()) { + _bm->try_populate(next.index(), [&](void *buffer) { #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_PREPB, (next.xmax() - next.x0()) * (next.kmax() - next.k0()) * sizeof(Toi)); + auto p=prof.ScopedProfiler(PROFILE_PREPB, (next.xmax()-next.x0()) * (next.kmax()-next.k0()) * sizeof(Toi)); #endif Toi *b_panel = reinterpret_cast(buffer); - if(_trB ^ strategy::B_transpose) - { + if (_trB ^ strategy::B_transpose) { Transform( - b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb, - next.x0(), next.xmax(), next.k0(), next.kmax()); - } - else - { + b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb, + next.x0(), next.xmax(), next.k0(), next.kmax()); + } else { Transform( - b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb, - next.x0(), next.xmax(), next.k0(), next.kmax()); + b_panel, this->_Bptr + (next.multi() * this->_B_multi_stride), this->_ldb, + next.x0(), next.xmax(), next.k0(), next.kmax()); } }); } + /* Get the buffer for this iteration from the BufferManager. */ - b_panel = reinterpret_cast(_bm->get(current.index(), [&](void *bpv) - { + b_panel = reinterpret_cast(_bm->get(current.index(), [&](void *bpv) { #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_PREPB, (current.xmax() - current.x0()) * (current.kmax() - current.k0()) * sizeof(Toi)); + auto p=prof.ScopedProfiler(PROFILE_PREPB, (current.xmax()-current.x0()) * (current.kmax()-current.k0()) * sizeof(Toi)); #endif Toi *b_panel = reinterpret_cast(bpv); - if(_trB ^ strategy::B_transpose) - { + if (_trB ^ strategy::B_transpose) { Transform( - b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); - } - else - { + b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb, + current.x0(), current.xmax(), current.k0(), current.kmax()); + } else { Transform( - b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); + b_panel, this->_Bptr + (current.multi() * this->_B_multi_stride), this->_ldb, + current.x0(), current.xmax(), current.k0(), current.kmax()); } - })); } /* Do the actual work. */ - for(unsigned int batch = batch_0; batch <= batch_end; batch++) - { - unsigned int first_m = (batch == batch_0) ? m_0 : 0; + for (unsigned int batch = batch_0; batch <= batch_end; batch++) { + unsigned int first_m = (batch == batch_0) ? m_0 : 0; unsigned int last_m = (batch == batch_end) ? 
m_max : _Msize; const Toi *a_ptr = a_panel + (batch * _Mround + first_m) * _k_block; - if(first_m >= last_m) + if (first_m >= last_m) continue; - for(unsigned int y = first_m; y < last_m; y += strategy::out_height) - { + for (unsigned int y=first_m; y { #ifdef CYCLE_PROFILING - auto p = prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr))); + auto p=prof.ScopedProfiler(PROFILE_MERGE, (strategy::out_height * bblocks * strategy::out_width * sizeof(Tr))); #endif MergeResults( - this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride), - c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(), - _alpha, (current.k0() == 0 ? _beta : static_cast(1))); + this->_Cptr + (batch * this->_C_batch_stride) + (current.multi() * this->_C_multi_stride), + c_panel, this->_ldc, y, ymax, current.x0(), current.xmax(), + _alpha, (current.k0()==0 ? _beta : static_cast(1))); } } } - if(pretransposed) - { + if (pretransposed) { b_panel += (bblocks * strat.out_width * kern_k); - } - else - { + } else { _bm->release(current.index()); } } @@ -391,14 +335,15 @@ class GemmInterleaved : public GemmCommon public: GemmInterleaved(GemmInterleaved &) = delete; - GemmInterleaved &operator=(GemmInterleaved &) = delete; + GemmInterleaved & operator= (GemmInterleaved &) = delete; /* Constructor */ GemmInterleaved(const CPUInfo *ci, const unsigned int M, const unsigned int N, const unsigned int K, const unsigned int nbatches, const unsigned int nmulti, const bool trA, const bool trB, - const Tr alpha, const Tr beta, const int maxthreads, const bool pretransposed) - : _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), _maxthreads(maxthreads), _pretransposed(pretransposed) - { + const Tr alpha, const Tr beta, const int maxthreads, const bool pretransposed) : + _ci(ci), _Msize(M), _Nsize(N), _Ksize(K), _nbatches(nbatches), _nmulti(nmulti), + _trA(trA), _trB(trB), _alpha(alpha), _beta(beta), + _maxthreads(maxthreads), _nthreads(maxthreads), _pretransposed(pretransposed) { const unsigned int L1_size = ci->get_L1_cache_size(); const unsigned int L2_size = ci->get_L2_cache_size(); @@ -426,7 +371,8 @@ public: // x_block: Work out how many rows (of length k_block) will fit in the L2 // Don't allocate more than 90% of the L2 to allow for overheads, and subtract off the L1 contents. - _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / (sizeof(Toi) * _k_block); + _x_block = (((L2_size * 9) / 10) - (_k_block * sizeof(Toi) * (strategy::out_width + strategy::out_height))) / + (sizeof(Toi) * _k_block); // Needs to be (at least a single) multiple of the kernel output width. _x_block /= strategy::out_width; @@ -434,7 +380,7 @@ public: // And tune to the presented problem size. int num_x_blocks = iceildiv(N, _x_block); - _x_block = iceildiv(N, num_x_blocks); + _x_block = iceildiv(N, num_x_blocks); _x_block = iceildiv(_x_block, strategy::out_width); _x_block *= strategy::out_width; @@ -450,45 +396,36 @@ public: // out work in units of out_height. Factor batches into the window, but // not multi for now (as this would cause problems with the buffer // manager). - - unsigned int get_window_size() const override - { + unsigned int get_window_size() const override { // _Mround is a multiple of out_height by definition. 
return (_Mround / strategy::out_height) * _nbatches; } // set_nthreads: pass on to buffer manager to avoid it waiting for non-existant threads. - void set_nthreads(int nthreads) override - { - if(_bm) - { - _bm->set_nthreads(nthreads); + void set_nthreads(int nthreads) override { + _nthreads = std::min(nthreads, _maxthreads); + if (_bm) { + _bm->set_nthreads(_nthreads); } } // Execute - void execute(unsigned int start, unsigned int end, int threadid) override - { - if(_pretransposed) - { + void execute(unsigned int start, unsigned int end, int threadid) override { + if (_pretransposed) { execute_internal(start, end, threadid); - } - else - { + } else { execute_internal(start, end, threadid); } } // Interface implementation - working space - size_t get_working_size() const override - { + size_t get_working_size() const override { // In all cases, we need one A buffer plus a C buffer per thread. size_t size = get_a_working_size() + (get_c_working_size() * _maxthreads); // For pretransposed case, there is no working space needed for B. // Otherwise, we need a BufferManager. - if(!_pretransposed) - { + if (!_pretransposed) { size += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size()); } @@ -497,33 +434,28 @@ public: return size; } - void set_working_space(void *working_space) override - { + void set_working_space(void *working_space) override { // Make sure everything ends up cache line aligned int8_t *working_space_bytes = reinterpret_cast(working_space); - intptr_t working_space_int = reinterpret_cast(working_space); + intptr_t working_space_int = reinterpret_cast(working_space); - size_t diff = 0; + size_t diff=0; - if(working_space_int & 0x3F) - { + if (working_space_int & 0x3F) { diff = 0x40 - (working_space_int & 0x3F); } working_space_bytes += diff; - if(_pretransposed) - { + if (_pretransposed) { // Pretransposed case: just set internal pointer to parameter value. _working_space = reinterpret_cast(working_space_bytes); - } - else - { + } else { // Otherwise, use the first part of the working space for the buffer manager. // It's legal to call this again so don't leak a buffer manager if it already existed. delete _bm; - _bm = new BufferManager(_maxthreads, get_b_working_size(), reinterpret_cast(working_space_bytes)); + _bm = new BufferManager(_nthreads, get_b_working_size(), reinterpret_cast(working_space_bytes)); working_space_bytes += BufferManager::get_storage_requirement(_maxthreads, get_b_working_size()); @@ -532,24 +464,20 @@ public: } // Interface implementation - pretransposed - bool B_is_pretransposed() const override - { + bool B_is_pretransposed() const override { return _pretransposed; } - bool B_pretranspose_required() const override - { - return _pretransposed && (_B_transposed == nullptr); + bool B_pretranspose_required() const override { + return _pretransposed && (_B_transposed==nullptr); } // TODO: this could almost certainly be considerably simpler. - size_t get_B_pretransposed_array_size() const override - { - size_t total = 0; + size_t get_B_pretransposed_array_size() const override { + size_t total=0; blockwalker current(*this); - do - { + do { /* Figure out the size of each block. 
*/ size_t x_size = (current.xmax() - current.x0()); size_t k_size = (current.kmax() - current.k0()); @@ -562,20 +490,17 @@ public: k_size *= strategy::k_unroll; total += x_size * k_size * sizeof(Toi); - } - while(current.advance()); + } while (current.advance()); return total; } - void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override - { + void pretranspose_B_array(void *in_buffer, const To *B, const int ldb, const int B_multi_stride) override { blockwalker current(*this); - Toi *buffer = reinterpret_cast(in_buffer); - _B_transposed = buffer; + Toi *buffer = reinterpret_cast(in_buffer); + _B_transposed = buffer; - do - { + do { /* Figure out the size of each block. */ size_t x_size = (current.xmax() - current.x0()); size_t k_size = (current.kmax() - current.k0()); @@ -587,31 +512,25 @@ public: k_size = iceildiv(k_size, strategy::k_unroll); k_size *= strategy::k_unroll; - if(_trB ^ strategy::B_transpose) - { + if (_trB ^ strategy::B_transpose) { Transform( - buffer, B + (current.multi() * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); - } - else - { + buffer, B + (current.multi() * B_multi_stride), ldb, + current.x0(), current.xmax(), current.k0(), current.kmax()); + } else { Transform( - buffer, B + (current.multi() * B_multi_stride), ldb, - current.x0(), current.xmax(), current.k0(), current.kmax()); + buffer, B + (current.multi() * B_multi_stride), ldb, + current.x0(), current.xmax(), current.k0(), current.kmax()); } buffer += (x_size * k_size); - } - while(current.advance()); + } while (current.advance()); } - void set_pretransposed_B_data(void *in_buffer) override - { + void set_pretransposed_B_data(void *in_buffer) override { _B_transposed = reinterpret_cast(in_buffer); } - ~GemmInterleaved() override - { + ~GemmInterleaved() override { delete _bm; } }; -- cgit v1.2.1
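The constructor hunk in the patch above sizes _x_block from the L2 cache: take roughly 90% of L2, subtract the per-kernel working set already budgeted against L1, divide by the footprint of one column block of B, then re-balance against the problem width N. The following is a rough, self-contained C++ sketch of that heuristic, not the library's own code: iceildiv is assumed to be the usual round-up integer division, the strategy traits (out_width, out_height, sizeof(Toi)) are passed as plain parameters, and the clamp to at least one output tile is an added assumption.

#include <algorithm>
#include <cstddef>
#include <cstdio>

// Assumed round-up integer division, as used throughout arm_gemm.
static unsigned int iceildiv(unsigned int a, unsigned int b) {
    return (a + b - 1) / b;
}

// Hypothetical standalone version of the x_block heuristic: how many columns
// of B (each k_block deep) to keep resident in the L2 cache.
static unsigned int choose_x_block(std::size_t L2_size, unsigned int k_block,
                                   std::size_t elem_size,   // stands in for sizeof(Toi)
                                   unsigned int out_width, unsigned int out_height,
                                   unsigned int N) {
    // Use ~90% of L2 and subtract the A/C tile working set already attributed to L1.
    unsigned int x_block = (((L2_size * 9) / 10) -
                            (k_block * elem_size * (out_width + out_height))) /
                           (elem_size * k_block);

    // Round down to a multiple of the kernel output width, keeping at least one tile.
    x_block = std::max(x_block / out_width, 1u) * out_width;

    // Re-balance against the presented problem width so the blocks are evenly sized.
    unsigned int num_x_blocks = iceildiv(N, x_block);
    x_block = iceildiv(N, num_x_blocks);
    x_block = iceildiv(x_block, out_width) * out_width;

    return x_block;
}

int main() {
    // Example: 512 KB L2, k_block of 256 fp32 elements, a 12x8 kernel tile, N = 1000.
    std::printf("x_block = %u\n",
                choose_x_block(512 * 1024, 256, sizeof(float), 12, 8, 1000));
    return 0;
}

With those example numbers the raw capacity estimate allows about 440 columns, which rounds to 432 and then re-balances to 336 so that N = 1000 splits into three near-equal blocks; the real constructor derives _k_block from the L1 size in a similar way before computing _x_block.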