From cc6129c06af98616a0e4d68475cfa3d92aaf63b3 Mon Sep 17 00:00:00 2001
From: Isabella Gottardi
Date: Fri, 14 Dec 2018 11:40:40 +0000
Subject: COMPMID-1812: CLSpaceToBatch paddings not calculated correctly

Change-Id: I63fed6799c4ed2848ff80cd7458124692a52bb98
Reviewed-on: https://review.mlplatform.org/400
Tested-by: Arm Jenkins
Reviewed-by: Michele Di Giorgio
Reviewed-by: Michalis Spyrou
---
 .../runtime/CL/functions/CLSpaceToBatchLayer.h    |  9 +++-
 src/core/CL/cl_kernels/space_to_batch.cl          | 60 +++++++++++++---------
 src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp | 16 +++++-
 src/runtime/CL/functions/CLSpaceToBatchLayer.cpp  | 34 +++++-------
 tests/datasets/SpaceToBatchDataset.h              | 16 +++---
 tests/validation/CL/SpaceToBatchLayer.cpp         |  8 +--
 tests/validation/reference/SpaceToBatch.cpp       | 44 ++++++++++------
 7 files changed, 111 insertions(+), 76 deletions(-)

diff --git a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
index 6478774701..9b1dd979ec 100644
--- a/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
+++ b/arm_compute/runtime/CL/functions/CLSpaceToBatchLayer.h
@@ -26,6 +26,7 @@
 
 #include "arm_compute/runtime/IFunction.h"
 
+#include "arm_compute/core/CL/kernels/CLMemsetKernel.h"
 #include "arm_compute/core/CL/kernels/CLSpaceToBatchLayerKernel.h"
 #include "arm_compute/core/Types.h"
 #include "arm_compute/runtime/CL/CLTensor.h"
@@ -34,7 +35,11 @@ namespace arm_compute
 {
 class ICLTensor;
 
-/** Basic function to run @ref CLSpaceToBatchLayerKernel. */
+/** Basic function to spatially divide a tensor. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref CLMemsetKernel
+ * -# @ref CLSpaceToBatchLayerKernel
+ */
 class CLSpaceToBatchLayer : public IFunction
 {
 public:
@@ -96,7 +101,7 @@ public:
 
 private:
     CLSpaceToBatchLayerKernel _space_to_batch_kernel; /**< SpaceToBatch kernel to run */
-    ICLTensor                *_output;                /**< Output tensor */
+    CLMemsetKernel            _memset_kernel;         /**< Memset kernel to run */
    bool                      _has_padding;           /**< Flag to check if the output has padding */
 };
 } // namespace arm_compute
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/space_to_batch.cl
index d42a79d3ff..79343d49c7 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/space_to_batch.cl
@@ -23,7 +23,7 @@
  */
 #include "helpers.h"
 
-#if defined(BATCH_SIZE) && defined(DATA_TYPE)
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
 /** Calculate the space to batch conversion.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -83,12 +83,15 @@ __kernel void space_to_batch_nchw(
     const int out_y    = get_global_id(1);
     const int z        = get_global_id(2);
 
-    if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
     {
-        const int r    = (BATCH_SIZE / (block_x * block_y));
-        const int w    = batch_id % r;
-        const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
-        const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
     }
 }
@@ -151,18 +154,21 @@ __kernel void space_to_batch_nhwc(
     const int out_y    = get_global_id(2);
     const int z        = get_global_id(0);
 
-    if((out_x >= pad_left_x && out_x < WIDTH_OUT - pad_right_x) && (out_y >= pad_left_y && out_y < HEIGHT_OUT - pad_right_y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
     {
-        const int r    = (BATCH_SIZE / (block_x * block_y));
-        const int w    = batch_id % r;
-        const int in_x = (out_x - pad_left_x) * block_x + (batch_id / r) % block_x;
-        const int in_y = (out_y - pad_left_y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - pad_left_x;
+        const int in_y = pos_y - pad_left_y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
     }
 }
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE)
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
 
-#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
 /** Calculate the space to batch conversion.
  *
  * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -207,12 +213,15 @@ __kernel void space_to_batch_static_nchw(
     const int out_y = get_global_id(1);
     const int z     = get_global_id(2);
 
-    if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
     {
-        const int r    = (BATCH_SIZE / (block_x * block_y));
-        const int w    = batch_id % r;
-        const int in_x = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
-        const int in_y = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
     }
 }
@@ -260,13 +269,16 @@ __kernel void space_to_batch_static_nhwc(
     const int out_y = get_global_id(2);
     const int z     = get_global_id(0);
 
-    if((out_x >= PAD_LEFT_X && out_x < WIDTH_OUT - PAD_RIGHT_X) && (out_y >= PAD_LEFT_Y && out_y < HEIGHT_OUT - PAD_RIGHT_Y))
+    const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+    const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+    if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
     {
-        const int r    = (BATCH_SIZE / (block_x * block_y));
-        const int w    = batch_id % r;
-        const int in_x = (out_x - PAD_LEFT_X) * block_x + (batch_id / r) % block_x;
-        const int in_y = (out_y - PAD_LEFT_Y) * block_y + (batch_id / r) / block_x;
+        const int w    = batch_id % BATCH_IN;
+        const int in_x = pos_x - PAD_LEFT_X;
+        const int in_y = pos_y - PAD_LEFT_Y;
+
         *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
     }
 }
-#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y)
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
index d488631ae9..f0391989a7 100644
--- a/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
+++ b/src/core/CL/kernels/CLSpaceToBatchLayerKernel.cpp
@@ -39,10 +39,16 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *block_inf
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, block_info, padddings, output);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(block_info, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4);
+    ARM_COMPUTE_RETURN_ERROR_ON(block_info->num_dimensions() > 1);
+    ARM_COMPUTE_RETURN_ERROR_ON(padddings->num_dimensions() > 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(padddings->tensor_shape()[1] != block_info->tensor_shape()[0]);
 
     // Validate output if initialized
     if(output->total_size() != 0)
     {
+        const DataLayout data_layout = input->data_layout();
+        const int        idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
+        ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
     }
 
@@ -64,8 +70,8 @@ Status validate_arguments_static(const ITensorInfo *input, const int block_shape
     const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL);
     const int idx_batch   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
     ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_width] < padding_left.x() + padding_right.y());
-    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_width] / block_shape_x != (output->tensor_shape()[idx_width] - padding_left.x() - padding_right.y()));
-    ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_height] / block_shape_y != (output->tensor_shape()[idx_height] - padding_left.x() - padding_right.y()));
+    ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_width] + padding_left.x() + padding_right.x()) % block_shape_x != 0);
+    ARM_COMPUTE_RETURN_ERROR_ON((input->tensor_shape()[idx_height] + padding_left.y() + padding_right.y()) % block_shape_y != 0);
     ARM_COMPUTE_RETURN_ERROR_ON(input->tensor_shape()[idx_channel] != output->tensor_shape()[idx_channel]);
     ARM_COMPUTE_RETURN_ERROR_ON(output->tensor_shape()[idx_batch] % (block_shape_x * block_shape_y) != 0);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -101,6 +107,9 @@ void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const ICLTenso
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+    build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+    build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("space_to_batch_" + lower_string(string_from_data_layout(input->info()->data_layout())), build_opts.options()));
 
     // Configure kernel window
@@ -132,6 +141,9 @@ void CLSpaceToBatchLayerKernel::configure(const ICLTensor *input, const int bloc
     build_opts.add_option("-DWIDTH_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_width)));
     build_opts.add_option("-DHEIGHT_OUT=" + support::cpp11::to_string(output->info()->dimension(idx_height)));
     build_opts.add_option("-DBATCH_SIZE=" + support::cpp11::to_string(output->info()->dimension(idx_batch)));
+    build_opts.add_option("-DWIDTH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_width)));
+    build_opts.add_option("-DHEIGHT_IN=" + support::cpp11::to_string(input->info()->dimension(idx_height)));
+    build_opts.add_option("-DBATCH_IN=" + support::cpp11::to_string(input->info()->dimension(idx_batch)));
     build_opts.add_option("-DBLOCK_SHAPE_X=" + support::cpp11::to_string(block_shape_x));
     build_opts.add_option("-DBLOCK_SHAPE_Y=" + support::cpp11::to_string(block_shape_y));
     build_opts.add_option("-DPAD_LEFT_X=" + support::cpp11::to_string(padding_left.x()));
diff --git a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
index 76c1e188e6..a24b72e461 100644
--- a/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
+++ b/src/runtime/CL/functions/CLSpaceToBatchLayer.cpp
@@ -33,20 +33,19 @@
 namespace arm_compute
 {
 CLSpaceToBatchLayer::CLSpaceToBatchLayer()
-    : _space_to_batch_kernel(), _output(nullptr), _has_padding(false)
+    : _space_to_batch_kernel(), _memset_kernel(), _has_padding(false)
 {
 }
 
 void CLSpaceToBatchLayer::configure(const ICLTensor *input, const ICLTensor *block_shape, const ICLTensor *paddings, ICLTensor *output)
 {
-    ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_ON_NULLPTR(input, block_shape, paddings, output);
 
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape, paddings, output);
 }
 
@@ -57,42 +56,35 @@ void CLSpaceToBatchLayer::configure(const ICLTensor *input, const int block_shap
     if(input->info()->tensor_shape().total_size() != output->info()->tensor_shape().total_size())
     {
         _has_padding = true;
+        _memset_kernel.configure(output, PixelValue());
     }
-
-    _output = output;
     _space_to_batch_kernel.configure(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const ITensorInfo *block_shape, const ITensorInfo *paddings, const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape, paddings, output));
+
+    return Status{};
 }
 
 Status CLSpaceToBatchLayer::validate(const ITensorInfo *input, const int block_shape_x, const int block_shape_y, const Size2D &padding_left, const Size2D &padding_right,
                                      const ITensorInfo *output)
 {
-    return CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output);
+    ARM_COMPUTE_RETURN_ON_ERROR(CLMemsetKernel::validate(output, PixelValue()));
+    ARM_COMPUTE_RETURN_ON_ERROR(CLSpaceToBatchLayerKernel::validate(input, block_shape_x, block_shape_y, padding_left, padding_right, output));
+
+    return Status{};
 }
 
 void CLSpaceToBatchLayer::run()
 {
     // Zero out output only if we have paddings
-    // TODO(micspy01): replace with memset once ready
     if(_has_padding)
     {
-        _output->map(CLScheduler::get().queue(), true);
-        if(is_data_type_quantized_asymmetric(_output->info()->data_type()))
-        {
-            const uint8_t quantized_zero = _output->info()->quantization_info().offset;
-            std::fill_n(_output->buffer(), _output->info()->total_size(), quantized_zero);
-        }
-        else
-        {
-            memset(_output->buffer(), 0, _output->info()->total_size());
-        }
-        _output->unmap(CLScheduler::get().queue());
+        CLScheduler::get().enqueue(_memset_kernel, true);
     }
-
     CLScheduler::get().enqueue(_space_to_batch_kernel, true);
 }
 } // namespace arm_compute
diff --git a/tests/datasets/SpaceToBatchDataset.h b/tests/datasets/SpaceToBatchDataset.h
index 0ab06f9327..37b0f2ee54 100644
--- a/tests/datasets/SpaceToBatchDataset.h
+++ b/tests/datasets/SpaceToBatchDataset.h
@@ -119,12 +119,12 @@ class SmallSpaceToBatchLayerDataset final : public SpaceToBatchLayerDataset
 public:
     SmallSpaceToBatchLayerDataset()
     {
-        add_config(TensorShape(2U, 2U, 1U, 1U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(1U, 1U, 1U, 4U));
-        add_config(TensorShape(6U, 2U, 1U, 1U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(3U, 1U, 1U, 4U));
-        add_config(TensorShape(2U, 4U, 2U, 1U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(1U, 2U, 2U, 4U));
-        add_config(TensorShape(2U, 6U, 1U, 2U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(1U, 3U, 1U, 8U));
-        add_config(TensorShape(6U, 8U, 1U, 1U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(3U, 4U, 1U, 4U));
-        add_config(TensorShape(6U, 8U, 15U, 5U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(3U, 4U, 15U, 20U));
+        add_config(TensorShape(2U, 2U, 1U, 1U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(1U, 1U, 1U, 4U));
+        add_config(TensorShape(6U, 2U, 1U, 1U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(3U, 1U, 1U, 4U));
+        add_config(TensorShape(2U, 4U, 2U, 1U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(1U, 2U, 2U, 4U));
+        add_config(TensorShape(2U, 6U, 1U, 2U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(1U, 3U, 1U, 8U));
+        add_config(TensorShape(6U, 8U, 1U, 1U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(3U, 4U, 1U, 4U));
+        add_config(TensorShape(6U, 8U, 15U, 5U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(3U, 4U, 15U, 20U));
     }
 };
 class LargeSpaceToBatchLayerDataset final : public SpaceToBatchLayerDataset
@@ -132,8 +132,8 @@ class LargeSpaceToBatchLayerDataset final : public SpaceToBatchLayerDataset
 public:
     LargeSpaceToBatchLayerDataset()
     {
-        add_config(TensorShape(128U, 64U, 2U, 1U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(64U, 32U, 2U, 4U));
-        add_config(TensorShape(512U, 64U, 2U, 1U), TensorShape(2U), TensorShape(2U, 4U), TensorShape(128U, 16U, 2U, 16U));
+        add_config(TensorShape(128U, 64U, 2U, 1U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(64U, 32U, 2U, 4U));
+        add_config(TensorShape(512U, 64U, 2U, 1U), TensorShape(2U), TensorShape(2U, 2U), TensorShape(128U, 16U, 2U, 16U));
     }
 };
 } // namespace datasets
diff --git a/tests/validation/CL/SpaceToBatchLayer.cpp b/tests/validation/CL/SpaceToBatchLayer.cpp
index 1c3d54dc81..00ed1fa820 100644
--- a/tests/validation/CL/SpaceToBatchLayer.cpp
+++ b/tests/validation/CL/SpaceToBatchLayer.cpp
@@ -75,10 +75,10 @@ DATA_TEST_CASE(Validate, framework::DatasetMode::ALL, zip(zip(zip(zip(
                TensorInfo(TensorShape(32U, 13U, 2U, 2U), 1, DataType::F32),     // Wrong data type block shape
                TensorInfo(TensorShape(32U, 13U, 2U, 2U, 4U), 1, DataType::F32), // Wrong tensor shape
              }),
-    framework::dataset::make("BlockShapeInfo",{ TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
-                                                TensorInfo(TensorShape(2U, 2U), 1, DataType::F16),
-                                                TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
+    framework::dataset::make("BlockShapeInfo",{ TensorInfo(TensorShape(2U), 1, DataType::S32),
+                                                TensorInfo(TensorShape(2U), 1, DataType::S32),
+                                                TensorInfo(TensorShape(2U), 1, DataType::F16),
+                                                TensorInfo(TensorShape(2U), 1, DataType::S32),
                                               })),
     framework::dataset::make("PaddingsShapeInfo",{ TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
                                                    TensorInfo(TensorShape(2U, 2U), 1, DataType::S32),
diff --git a/tests/validation/reference/SpaceToBatch.cpp b/tests/validation/reference/SpaceToBatch.cpp
index 979ab94b33..c635d4abfd 100644
--- a/tests/validation/reference/SpaceToBatch.cpp
+++ b/tests/validation/reference/SpaceToBatch.cpp
@@ -39,38 +39,52 @@ SimpleTensor<T> space_to_batch(const SimpleTensor<T> &src, const SimpleTensor<i
 {
     SimpleTensor<T> result(dst_shape, src.data_type());
 
-    auto      width_out  = static_cast<int>(dst_shape[0]);
-    auto      height_out = static_cast<int>(dst_shape[1]);
-    auto      z_out      = static_cast<int>(dst_shape[2]);
+    const auto width_out  = static_cast<int>(dst_shape[0]);
+    const auto height_out = static_cast<int>(dst_shape[1]);
+    const auto batch_out  = static_cast<int>(dst_shape[3]);
+
+    const auto width_in  = static_cast<int>(src.shape()[0]);
+    const auto height_in = static_cast<int>(src.shape()[1]);
+    const auto batch_in  = static_cast<int>(src.shape()[3]);
+
+    const auto channel = static_cast<int>(src.shape()[2]);
+
+    const auto block_width  = block_shape[0];
+    const auto block_height = block_shape[1];
+
+    const auto padding_left = paddings[0];
+    const auto padding_top  = paddings[2];
 
     int out_pos = 0;
-    for(int batch = 0; batch < static_cast<int>(dst_shape[3]); ++batch)
+    for(int outB = 0; outB < batch_out; ++outB)
     {
-        for(int z = 0; z < z_out; ++z)
+        unsigned int inB = outB % batch_in;
+
+        int shift_w = (outB / batch_in) % block_width;
+        int shift_h = (outB / batch_in) / block_width;
+
+        for(int c = 0; c < channel; ++c)
         {
-            for(int y = 0; y < height_out; ++y)
+            for(int outH = 0; outH < height_out; ++outH)
             {
-                for(int x = 0; x < width_out; ++x)
+                for(int outW = 0; outW < width_out; ++outW)
                 {
-                    if(x < paddings[0] || x > width_out - paddings[1] - 1
-                       || y < paddings[2] || y > height_out - paddings[3] - 1)
+                    const auto in_pos = ((inB * channel + c) * height_in + ((outH * block_height + shift_h) - padding_top)) * width_in + (outW * block_width + shift_w) - padding_left;
+
+                    if(outH * block_height + shift_h < padding_top || outH * block_height + shift_h >= padding_top + height_in || outW * block_width + shift_w < padding_left
+                       || outW * block_width + shift_w >= padding_left + width_in)
                    {
                         result[out_pos] = 0;
                     }
                     else
                     {
-                        const int r    = dst_shape[3] / (block_shape[0] * block_shape[1]);
-                        const int in_x = (block_shape[0] * (x - paddings[0]) + (batch / r) % block_shape[0]);
-                        const int in_y = (block_shape[1] * (y - paddings[2]) + (batch / r) / block_shape[0]);
-                        int       in_pos = in_x + src.shape()[0] * in_y + z * src.shape()[0] * src.shape()[1] + (batch % r) * src.shape()[0] * src.shape()[1] * src.shape()[2];
-                        result[out_pos]  = src[in_pos];
+                        result[out_pos] = src[in_pos];
                     }
                     ++out_pos;
                 }
             }
         }
     }
-
     return result;
 }
--
cgit v1.2.1
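
P.S. For readers following the change: below is a minimal standalone C++ sketch (illustration only, not part of the patch; the sizes are hypothetical) of the coordinate mapping that the reworked kernels and the reference implementation share. Each element of output batch outB at (outW, outH) is lifted to a position (pos_x, pos_y) in the *padded* input space; anything that lands inside the padding stays zero, which on the GPU side is now done up front by CLMemsetKernel.

// Standalone sketch of the space-to-batch coordinate mapping (illustration only).
#include <cstdio>
#include <vector>

int main()
{
    // Hypothetical sizes, chosen to keep the printout small.
    const int width_in = 2, height_in = 2, batch_in = 1;
    const int block_w = 2, block_h = 2;
    const int pad_left = 1, pad_right = 1, pad_top = 1, pad_bottom = 1;

    // The padded extents must divide evenly by the block shape
    // (the reworked validate_arguments_static checks exactly this).
    const int width_out  = (width_in + pad_left + pad_right) / block_w;  // 2
    const int height_out = (height_in + pad_top + pad_bottom) / block_h; // 2
    const int batch_out  = batch_in * block_w * block_h;                 // 4

    const std::vector<float> src = { 1.f, 2.f, 3.f, 4.f }; // 2x2 input, one channel, one batch

    for(int outB = 0; outB < batch_out; ++outB)
    {
        const int inB     = outB % batch_in;
        const int shift_w = (outB / batch_in) % block_w;
        const int shift_h = (outB / batch_in) / block_w;
        for(int outH = 0; outH < height_out; ++outH)
        {
            for(int outW = 0; outW < width_out; ++outW)
            {
                // Position in the padded input space: the pos_x/pos_y of the kernels.
                const int pos_x = outW * block_w + shift_w;
                const int pos_y = outH * block_h + shift_h;

                float val = 0.f; // padding region stays zero (CLMemsetKernel's job on the GPU)
                if(pos_x >= pad_left && pos_x < pad_left + width_in && pos_y >= pad_top && pos_y < pad_top + height_in)
                {
                    val = src[(inB * height_in + (pos_y - pad_top)) * width_in + (pos_x - pad_left)];
                }
                std::printf("out[batch=%d](%d,%d) = %g\n", outB, outW, outH, val);
            }
        }
    }
    return 0;
}

With these sizes, output batch 0 picks up src value 4 at (1,1) (padded position (2,2)) and leaves its other three elements at zero; the remaining batches gather the other samples shifted by (shift_w, shift_h). Padding-dominated elements like these are exactly the case the previous output-space bounds check could misclassify.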