COMPMID-1736: Fixed out-of-bound write in CLIm2Col

The issue was related to CLIm2Col when the number of input channels was less than the number of elements processed by each thread. The bug has been fixed in the validate_and_configure_window() function setting the correct number of elements accessed in the output tensor. Also fixed an issue GEMM3D when we have a single output channel Change-Id: I094292d0c7662599c4a4c3916ec5f5821df5faef
author: Gian Marco Iodice <gianmarco.iodice@arm.com> 2018-11-05 14:26:32 +0000
committer: Gian Marco Iodice <gianmarco.iodice@arm.com> 2018-11-08 13:31:33 +0000
commit: 3139f03a74ede3b3bd7cfc6ff219e6c9bc556632 (patch)
tree: cd67b8c9ec1688fcf739c576553b10e435e6938e
parent: 421405b6a21b124288a750e2da26dc01eb7391cb (diff)
download: ComputeLibrary-3139f03a74ede3b3bd7cfc6ff219e6c9bc556632.tar.gz
17 files changed, 104 insertions, 54 deletions
diff --git a/arm_compute/core/Types.h b/arm_compute/core/Types.h
index cb1a212797..0240916da8 100644
--- a/arm_compute/core/Types.h
+++ b/arm_compute/core/Types.h
@@ -1192,7 +1192,7 @@ class GEMMReshapeInfo final
 public:
     /** Default constructor */
     GEMMReshapeInfo()
-        : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(1), _reinterpret_input_as_3d(false)
+        : _m(1), _n(1), _k(1), _mult_transpose1xW_width(1), _mult_interleave4x4_height(1), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false)
     {
     }
     /** Constructor
@@ -1202,11 +1202,12 @@ public:
      * @param[in] k                         Number of matrix A columns or matrix B rows
      * @param[in] mult_transpose1xW_width   (Optional) Multiplication factor for the width of the 1xW transposed block
      * @param[in] mult_interleave4x4_height (Optional) Multiplication factor for the height of the 4x4 interleaved block
-     * @param[in] depth_output_gemm3d       (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
+     * @param[in] depth_output_gemm3d       (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel.
+     *                                      If 0 the output will not be reinterpreted as 3D. Default 0
      * @param[in] reinterpret_input_as_3d   (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
      *                                                 to perform 1x1 convolutions with the NHWC data layout)
      */
-    GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 1, bool reinterpret_input_as_3d = false)
+    GEMMReshapeInfo(int m, int n, int k, int mult_transpose1xW_width = 1, int mult_interleave4x4_height = 1, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false)
         : _m(m), _n(n), _k(k), _mult_transpose1xW_width(mult_transpose1xW_width), _mult_interleave4x4_height(mult_interleave4x4_height), _depth_output_gemm3d(depth_output_gemm3d),
           _reinterpret_input_as_3d(reinterpret_input_as_3d)
     {
@@ -1311,7 +1312,7 @@ class GEMMInfo
 public:
     /** Default constructor */
     GEMMInfo()
-        : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _depth_output_gemm3d(1), _reinterpret_input_as_3d(false), _retain_internal_weights(false), _gemmlowp_output_stage()
+        : _is_a_reshaped(false), _is_b_reshaped(false), _reshape_b_only_on_first_run(false), _depth_output_gemm3d(0), _reinterpret_input_as_3d(false), _retain_internal_weights(false), _gemmlowp_output_stage()
     {
     }
     /** Constructor
@@ -1320,13 +1321,14 @@ public:
      * @param[in] is_b_reshaped               True if the matrix B has been reshaped
      * @param[in] reshape_b_only_on_first_run Reshape matrix B only for the first run
      * @param[in] depth_output_gemm3d         (Optional) Depth (third dimension) of the output tensor to be used with the GEMM3D kernel
+     *                                        If 0 the output will not be reinterpreted as 3D. Default 0
      * @param[in] reinterpret_input_as_3d     (Optional) Reinterpret the input as 3D tensor. (i.e. this flag should be set to true when GEMM is used
      *                                        to perform 1x1 convolutions with the NHWC data layout)
      * @param[in] retain_internal_weights     (Optional) Retain the weights tensor from previous run
      * @param[in] gemmlowp_output_stage       (Optional) GEMMLowp Output stage info
      *
      */
-    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 1, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
+    GEMMInfo(bool is_a_reshaped, bool is_b_reshaped, bool reshape_b_only_on_first_run, int depth_output_gemm3d = 0, bool reinterpret_input_as_3d = false, bool retain_internal_weights = false,
              GEMMLowpOutputStageInfo gemmlowp_output_stage = GEMMLowpOutputStageInfo())
         : _is_a_reshaped(is_a_reshaped), _is_b_reshaped(is_b_reshaped), _reshape_b_only_on_first_run(reshape_b_only_on_first_run), _depth_output_gemm3d(depth_output_gemm3d),
           _reinterpret_input_as_3d(reinterpret_input_as_3d), _retain_internal_weights(retain_internal_weights), _gemmlowp_output_stage(gemmlowp_output_stage)
diff --git a/arm_compute/core/utils/misc/ShapeCalculator.h b/arm_compute/core/utils/misc/ShapeCalculator.h
index 56f65d0ba8..1f532ca31e 100644
--- a/arm_compute/core/utils/misc/ShapeCalculator.h
+++ b/arm_compute/core/utils/misc/ShapeCalculator.h
@@ -532,13 +532,14 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
     ARM_COMPUTE_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The first input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true");
 
     const bool reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
-    const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 1;
+    const bool reinterpret_output_as_3d = reshape_info.depth_output_gemm3d() != 0;
+    const int  depth_output_gemm3d      = reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : 1;
     const int  m                        = reshape_info.reinterpret_input_as_3d() ? input0.dimension(1) * input0.dimension(2) : input0.dimension(1);
 
     // If the output of GEMM has to be reinterpreted as 3D, the number of input0 rows (M) is obtained collapsing the second and third
     // dimension of the output tensor
     const int dim0 = is_interleaved_transposed ? reshape_info.n() : input1.dimension(0);
-    const int dim1 = is_interleaved_transposed ? reshape_info.m() / reshape_info.depth_output_gemm3d() : m / reshape_info.depth_output_gemm3d();
+    const int dim1 = is_interleaved_transposed ? reshape_info.m() / depth_output_gemm3d : m / depth_output_gemm3d;
     const int dim2 = reinterpret_input_as_3d ? input0.tensor_shape()[3] : input0.tensor_shape()[2];
     const int dim3 = reinterpret_input_as_3d ? 1 : input0.tensor_shape()[3];
 
@@ -546,7 +547,7 @@ inline TensorShape compute_mm_shape(const ITensorInfo &input0, const ITensorInfo
 
     output_shape.set(0, dim0);
     output_shape.set(1, dim1);
-    output_shape.set(2, reinterpret_output_as_3d ? reshape_info.depth_output_gemm3d() : dim2);
+    output_shape.set(2, reinterpret_output_as_3d ? depth_output_gemm3d : dim2);
     output_shape.set(3, reinterpret_output_as_3d ? dim2 : dim3);
     output_shape.set(4, reinterpret_output_as_3d ? dim3 : 1);
 
diff --git a/arm_compute/core/utils/quantization/AsymmHelpers.h b/arm_compute/core/utils/quantization/AsymmHelpers.h
index 6fd1d80010..d9f20cee2b 100644
--- a/arm_compute/core/utils/quantization/AsymmHelpers.h
+++ b/arm_compute/core/utils/quantization/AsymmHelpers.h
@@ -38,7 +38,7 @@ namespace quantization
  *
  * @return a status
  */
-arm_compute::Status calculate_quantized_multiplier_less_than_one(double multiplier, int *quant_multiplier, int *right_shift);
+arm_compute::Status calculate_quantized_multiplier_less_than_one(float multiplier, int *quant_multiplier, int *right_shift);
 /** Calculate quantized representation of multiplier having value greater than one.
  *
  * @param[in]  multiplier           Real multiplier.
@@ -47,7 +47,7 @@ arm_compute::Status calculate_quantized_multiplier_less_than_one(double multipli
  *
  * @return a status
  */
-arm_compute::Status calculate_quantized_multiplier_greater_than_one(double multiplier, int *quantized_multiplier, int *left_shift);
+arm_compute::Status calculate_quantized_multiplier_greater_than_one(float multiplier, int *quantized_multiplier, int *left_shift);
 } // namespace quantization
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_IO_FILE_HANDLER_H__ */
diff --git a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
index 73b1d41eb1..b2fb3e0278 100644
--- a/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpMatrixMultiplyKernel.cpp
@@ -112,7 +112,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input0, ITe
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool          reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
-    bool          reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 1);
+    bool          reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
 
     Window win{};
     Window win_out{};
@@ -227,7 +227,7 @@ void CLGEMMLowpMatrixMultiplyKernel::configure(const ICLTensor *input0, const IC
     _input1                   = input1;
     _output                   = output;
     _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
-    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1);
+    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
diff --git a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
index 715edae606..5e02dda9e3 100644
--- a/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMMatrixMultiplyKernel.cpp
@@ -110,7 +110,7 @@ inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *inpu
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool           reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
-    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 1);
+    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
@@ -227,7 +227,7 @@ void CLGEMMMatrixMultiplyKernel::configure(const ICLTensor *input0, const ICLTen
     _input1                   = input1;
     _output                   = output;
     _reinterpret_input_as_3d  = reshape_info.reinterpret_input_as_3d();
-    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 1);
+    _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0);
 
     // In case both input and output have to be reinterpreted as 3D tensors,
     // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
diff --git a/src/core/CL/kernels/CLIm2ColKernel.cpp b/src/core/CL/kernels/CLIm2ColKernel.cpp
index 0ba0d0e2f9..54ef23f2a2 100644
--- a/src/core/CL/kernels/CLIm2ColKernel.cpp
+++ b/src/core/CL/kernels/CLIm2ColKernel.cpp
@@ -109,7 +109,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
         const int yin_end   = input->dimension(1);
 
         const int xout_start = 0;
-        const int xout_end   = input->dimension(0) < num_elems_processed_per_iteration ? ceil_to_multiple(output->dimension(0), num_elems_processed_per_iteration) : output->dimension(0);
+        const int xout_end   = input->dimension(0) < num_elems_processed_per_iteration ? output->dimension(0) + (num_elems_processed_per_iteration - input->dimension(0)) : output->dimension(0);
         const int yout_start = 0;
         const int yout_end   = output->dimension(1);
 
diff --git a/src/core/utils/quantization/AsymmHelpers.cpp b/src/core/utils/quantization/AsymmHelpers.cpp
index 8bb6d8e173..ea9ba776a9 100644
--- a/src/core/utils/quantization/AsymmHelpers.cpp
+++ b/src/core/utils/quantization/AsymmHelpers.cpp
@@ -1,5 +1,5 @@
 /*
- * Copyright (c) 2017 ARM Limited.
+ * Copyright (c) 2017-2018 ARM Limited.
  *
  * SPDX-License-Identifier: MIT
  *
@@ -30,21 +30,30 @@
 using namespace arm_compute::quantization;
 
 constexpr int64_t fixed_point_one_Q0 = (1ll << 31);
+constexpr float   epsilon            = 0.00001f;
 
-arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(double multiplier,
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_less_than_one(float multiplier,
                                                                                             int   *quant_multiplier,
                                                                                             int   *right_shift)
 {
     ARM_COMPUTE_RETURN_ERROR_ON(quant_multiplier == nullptr);
     ARM_COMPUTE_RETURN_ERROR_ON(right_shift == nullptr);
-    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < 0);
-    ARM_COMPUTE_RETURN_ERROR_ON(multiplier >= 1);
-    if(multiplier == 0)
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier < -epsilon);
+    ARM_COMPUTE_RETURN_ERROR_ON(multiplier > 1.0f + epsilon);
+    if(std::fabs(1.0f - multiplier) < epsilon)
+    {
+        *quant_multiplier = 1;
+        *right_shift      = 0;
+        return arm_compute::Status{};
+    }
+
+    if(std::fabs(0.0f - multiplier) < epsilon)
     {
         *quant_multiplier = 0;
         *right_shift      = 0;
         return arm_compute::Status{};
     }
+
     const double q = std::frexp(multiplier, right_shift);
     *right_shift *= -1;
     auto q_fixed = static_cast<int64_t>(round(q * fixed_point_one_Q0));
@@ -61,7 +70,7 @@ arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_le
     return arm_compute::Status{};
 }
 
-arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(double multiplier,
+arm_compute::Status arm_compute::quantization::calculate_quantized_multiplier_greater_than_one(float multiplier,
                                                                                                int   *quantized_multiplier,
                                                                                                int   *left_shift)
 {
diff --git a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
index c5637dba26..6a2aac6457 100644
--- a/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
+++ b/src/runtime/CL/functions/CLFullyConnectedLayer.cpp
@@ -101,7 +101,7 @@ void CLFullyConnectedLayer::configure_mm(const ICLTensor *input, const ICLTensor
     else
     {
         // Configure matrix multiply kernel
-        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 1, false, retain_internal_weights));
+        _mm_gemm.configure(input, weights, nullptr, output, 1.f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run */, 0, false, retain_internal_weights));
     }
 }
 
diff --git a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
index 67f55d56e2..4825d878f8 100644
--- a/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
+++ b/src/runtime/CL/functions/CLGEMMConvolutionLayer.cpp
@@ -276,14 +276,16 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     {
         const QuantizationInfo output_quant_info = (output->info()->total_size() == 0) ? input->info()->quantization_info() : output->info()->quantization_info();
 
-        float multiplier = input->info()->quantization_info().scale * weights->info()->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
+        const float multiplier  = (input->info()->quantization_info().scale * weights->info()->quantization_info().scale) / output_quant_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
         quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
 
         int min_activation = 0;
         int max_activation = 0;
 
         const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU,
+                                                                                   ActivationLayerInfo::ActivationFunction::BOUNDED_RELU,
                                                                                    ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU
                                                                                  };
 
@@ -308,7 +310,10 @@ void CLGEMMConvolutionLayer::configure(const ICLTensor *input, const ICLTensor *
     }
 
     // Configure and tune GEMM
-    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, (data_layout == DataLayout::NHWC) ? conv_h : 1);
+    // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+    configure_mm(gemm_input_to_use, &_weights_reshaped, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth);
 
     if(!_skip_im2col)
     {
@@ -454,9 +459,11 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
     {
         const QuantizationInfo output_quant_info = (output->total_size() == 0) ? input->quantization_info() : output->quantization_info();
 
-        float multiplier = input->quantization_info().scale * weights->quantization_info().scale / output_quant_info.scale;
-        int   output_multiplier, output_shift;
-        quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift);
+        const float multiplier  = (input->quantization_info().scale * weights->quantization_info().scale) / output_quant_info.scale;
+        int   output_multiplier = 0;
+        int   output_shift      = 0;
+
+        ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier_less_than_one(multiplier, &output_multiplier, &output_shift));
 
         int min_activation = 0;
         int max_activation = 0;
@@ -485,17 +492,20 @@ Status CLGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
 
             // If the activation layer is RELU, BOUNDED_RELU or LU_BOUNDED_RELU, we can use the GEMMLowp output stage to perform this operation
             is_activationlayer_enabled = false;
-
-            // Set the GEMMLowp output stage info
-            gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
-            gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
-            gemmlowp_output_stage.gemmlowp_shift      = output_shift;
-            gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
-            gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
         }
+
+        // Set the GEMMLowp output stage info
+        gemmlowp_output_stage.gemmlowp_offset     = output_quant_info.offset;
+        gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier;
+        gemmlowp_output_stage.gemmlowp_shift      = output_shift;
+        gemmlowp_output_stage.gemmlowp_min_bound  = min_activation;
+        gemmlowp_output_stage.gemmlowp_max_bound  = max_activation;
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, skip_col2im ? conv_h : 1, skip_im2col));
+    // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0;
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col));
 
     // Validate Col2Im
     if(!skip_col2im)
diff --git a/src/runtime/NEON/functions/NEGEMM.cpp b/src/runtime/NEON/functions/NEGEMM.cpp
index 321ecf85d8..e8bf6732b2 100644
--- a/src/runtime/NEON/functions/NEGEMM.cpp
+++ b/src/runtime/NEON/functions/NEGEMM.cpp
@@ -140,7 +140,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
 
     if(c != nullptr)
     {
-        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.depth_output_gemm3d() != 0);
         ARM_COMPUTE_RETURN_ERROR_ON(gemm_info.reinterpret_input_as_3d());
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(a, c);
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(a->dimension(1) != c->dimension(1), "The C matrix must have the same number of rows as the matrix A");
@@ -150,7 +150,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     if(output->total_size() != 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON(b->dimension(0) != output->dimension(0));
-        if(gemm_info.depth_output_gemm3d() != 1)
+        if(gemm_info.depth_output_gemm3d() != 0)
         {
             if(gemm_info.reinterpret_input_as_3d())
             {
@@ -174,7 +174,7 @@ Status NEGEMM::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso
     if(!run_optimised)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMM cannot reinterpret the input tensor as 3D");
-        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMM cannot reinterpret the output tensor as 3D");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMM cannot reinterpret the output tensor as 3D");
 
         // Check if the first input tensor is a vector.
         const bool run_vector_matrix_multiplication = a->dimension(1) < 2;
diff --git a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
index fc65469488..02ae1715da 100644
--- a/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
+++ b/src/runtime/NEON/functions/NEGEMMConvolutionLayer.cpp
@@ -276,7 +276,9 @@ void NEGEMMConvolutionLayer::configure(const ITensor *input, const ITensor *weig
     }
 
     // Configure GEMM
-    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, _skip_col2im ? conv_h : 1);
+    // In case we need to skip col2im, GEMM3D (gemm_3d_depth != 0) must be called in order to avoid reshaping the output matrix
+    const unsigned int gemm_3d_depth = _skip_col2im ? conv_h : 0;
+    configure_mm(gemm_input_to_use, &_weights_reshaped, gemm_output_to_use, gemm_3d_depth);
 
     if(!_skip_im2col)
     {
@@ -477,7 +479,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
         gemm_output_to_use = &info_gemm;
     }
 
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 1, skip_im2col));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, gemm_output_to_use, skip_col2im ? conv_h : 0, skip_im2col));
 
     if(is_quantized)
     {
@@ -518,7 +520,7 @@ Status NEGEMMConvolutionLayer::validate(const ITensorInfo *input, const ITensorI
         }
 
         // Validate output stage for quantized case
-        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min, max, skip_reshape ? conv_h : 1);
+        NEGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(gemm_output_to_use, biases, gemm_output_staged_to_use, min, max, skip_reshape ? conv_h : 0);
     }
 
     // Validate Col2Im/ReshapeLayer
diff --git a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
index 80f5ab0c93..16ee3d07fd 100644
--- a/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
+++ b/src/runtime/NEON/functions/NEGEMMLowpMatrixMultiplyCore.cpp
@@ -198,7 +198,7 @@ Status NEGEMMLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.reinterpret_input_as_3d(), "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the input tensor as 3D");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 1, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.depth_output_gemm3d() != 0, "NEGEMMLowpMatrixMultiplyCore cannot reinterpret the output tensor as 3D");
 
     int32_t a_offset                         = a->quantization_info().offset;
     int32_t b_offset                         = b->quantization_info().offset;
diff --git a/tests/datasets/SmallConvolutionLayerDataset.h b/tests/datasets/SmallConvolutionLayerDataset.h
index bbfc760bf3..df9196fb77 100644
--- a/tests/datasets/SmallConvolutionLayerDataset.h
+++ b/tests/datasets/SmallConvolutionLayerDataset.h
@@ -139,6 +139,7 @@ public:
     {
         add_config(TensorShape(224U, 224U, 3U), TensorShape(3U, 3U, 3U, 32U), TensorShape(32U), TensorShape(112U, 112U, 32U),
                    PadStrideInfo(2, 2, /*left*/ 0, /*right*/ 1, /*top*/ 0, /*bottom*/ 1, DimensionRoundingType::FLOOR));
+
         // Batch size 1
         add_config(TensorShape(23U, 27U, 5U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(11U, 25U, 21U), PadStrideInfo(2, 1, 0, 0));
         add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 5U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U), PadStrideInfo(3, 2, 1, 0));
@@ -146,6 +147,7 @@ public:
         add_config(TensorShape(23U, 27U, 5U), TensorShape(3U, 1U, 5U, 21U), TensorShape(21U), TensorShape(11U, 27U, 21U), PadStrideInfo(2, 1, 0, 0));
         add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 11U, 16U), PadStrideInfo(3, 2, 1, 0));
         add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 3U, 2U, 19U), TensorShape(19U), TensorShape(15U, 16U, 19U), PadStrideInfo(1, 2, 1, 1));
+        add_config(TensorShape(3U, 3U, 1U), TensorShape(2U, 2U, 1U, 11U), TensorShape(11U), TensorShape(2U, 2U, 11U), PadStrideInfo(1, 1, 0, 0));
         // Batch size 4
         add_config(TensorShape(23U, 27U, 5U, 4U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(11U, 25U, 21U, 4U), PadStrideInfo(2, 1, 0, 0));
         add_config(TensorShape(33U, 27U, 7U, 4U), TensorShape(5U, 5U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 4U), PadStrideInfo(3, 2, 1, 0));
@@ -164,9 +166,26 @@ public:
         add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 3, 0, 2, DimensionRoundingType::FLOOR));
         add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 1, 0, 1, 0, DimensionRoundingType::FLOOR));
         add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(10U, 11U, 16U, 5U), PadStrideInfo(3, 2, 0, 1, 0, 1, DimensionRoundingType::FLOOR));
-        // TODO (micgio01) - COMPMID-1604: investigate issue in GLES and re-enable the following dataset
-        // Single output channel
-        //add_config(TensorShape(5U, 4U, 3U, 2U), TensorShape(4U, 4U, 3U, 1U), TensorShape(1U), TensorShape(2U, 1U, 1U, 2U), PadStrideInfo(1, 1, 0, 0, 0, 0, DimensionRoundingType::FLOOR));
+
+        add_config(TensorShape(5U, 4U, 3U, 2U), TensorShape(4U, 4U, 3U, 1U), TensorShape(1U), TensorShape(2U, 1U, 1U, 2U), PadStrideInfo(1, 1, 0, 0, 0, 0, DimensionRoundingType::FLOOR));
+    }
+};
+
+// TODO (COMPMID-1749)
+class SmallConvolutionLayerReducedDataset final : public ConvolutionLayerDataset
+{
+public:
+    SmallConvolutionLayerReducedDataset()
+    {
+        // Batch size 1
+        add_config(TensorShape(23U, 27U, 5U), TensorShape(3U, 3U, 5U, 21U), TensorShape(21U), TensorShape(11U, 25U, 21U), PadStrideInfo(2, 1, 0, 0));
+        add_config(TensorShape(33U, 27U, 7U), TensorShape(5U, 5U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U), PadStrideInfo(3, 2, 1, 0));
+        add_config(TensorShape(17U, 31U, 2U), TensorShape(5U, 5U, 2U, 19U), TensorShape(19U), TensorShape(15U, 15U, 19U), PadStrideInfo(1, 2, 1, 1));
+
+        // Asymmetric padding
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 2, 0, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 1, 1, 0, 2, DimensionRoundingType::FLOOR));
+        add_config(TensorShape(33U, 27U, 7U, 5U), TensorShape(5U, 7U, 7U, 16U), TensorShape(16U), TensorShape(11U, 12U, 16U, 5U), PadStrideInfo(3, 2, 2, 1, 2, 0, DimensionRoundingType::FLOOR));
     }
 };
 
diff --git a/tests/validation/CL/ConvolutionLayer.cpp b/tests/validation/CL/ConvolutionLayer.cpp
index 0274bed977..5f10b4b9bc 100644
--- a/tests/validation/CL/ConvolutionLayer.cpp
+++ b/tests/validation/CL/ConvolutionLayer.cpp
@@ -268,11 +268,18 @@ const auto QuantizedActivationFunctionsDataset = framework::dataset::make("Activ
 TEST_SUITE(Quantized)
 TEST_SUITE(QASYMM8)
 
+const auto QuantizationData = framework::dataset::make("QuantizationInfo",
+{
+    QuantizationInfo(0.5f, 10),
+    QuantizationInfo(0.3f, 3),
+    QuantizationInfo(1.f, 10),
+});
+
 FIXTURE_DATA_TEST_CASE(RunSmall, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 10) })),
+                       QuantizationData),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
@@ -282,7 +289,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, CLGEMMConvolutionLayerQuantizedFixture<uint8_t>
                        framework::dataset::make("ReshapeWeights", { true })),
                        framework::dataset::make("DataType", DataType::QASYMM8)),
                        framework::dataset::make("DataLayout", { DataLayout::NCHW, DataLayout::NHWC })),
-                       framework::dataset::make("QuantizationInfo", { QuantizationInfo(2.f / 255.f, 0) })),
+                       QuantizationData),
                        QuantizedActivationFunctionsDataset))
 {
     // Validate output
diff --git a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
index 2961dc9519..85a477b293 100644
--- a/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
+++ b/tests/validation/GLES_COMPUTE/ConvolutionLayer.cpp
@@ -65,7 +65,7 @@ const auto ActivationFunctionsDataset = framework::dataset::make("ActivationInfo
 TEST_SUITE(GC)
 TEST_SUITE(ConvolutionLayer)
 
-DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerDataset(), datasets::LargeConvolutionLayerDataset()),
+DATA_TEST_CASE(Configuration, framework::DatasetMode::ALL, combine(combine(framework::dataset::concat(datasets::SmallConvolutionLayerReducedDataset(), datasets::LargeConvolutionLayerDataset()),
                                                                            CNNDataTypes),
                                                                    ActivationFunctionsDataset),
                input_shape, weights_shape, bias_shape, output_shape, info, dilation, data_type, act_info)
@@ -114,7 +114,7 @@ using GCConvolutionLayerFixture = ConvolutionValidationFixture<GCTensor, GCAcces
 
 TEST_SUITE(Float)
 TEST_SUITE(FP16)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<half>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(),
                                                                                                                      framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                      framework::dataset::make("DataType",
                                                                                                                              DataType::F16)),
@@ -139,7 +139,7 @@ FIXTURE_DATA_TEST_CASE(RunLarge, GCConvolutionLayerFixture<half>, framework::Dat
 TEST_SUITE_END()
 
 TEST_SUITE(FP32)
-FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerDataset(),
+FIXTURE_DATA_TEST_CASE(RunSmall, GCConvolutionLayerFixture<float>, framework::DatasetMode::PRECOMMIT, combine(combine(combine(combine(datasets::SmallConvolutionLayerReducedDataset(),
                                                                                                                       framework::dataset::make("ReshapeWeights", { true })),
                                                                                                                       framework::dataset::make("DataType", DataType::F32)),
                                                                                                                       framework::dataset::make("DataLayout",
diff --git a/tests/validation/fixtures/GEMMFixture.h b/tests/validation/fixtures/GEMMFixture.h
index 255b12c0ed..74205e25d4 100644
--- a/tests/validation/fixtures/GEMMFixture.h
+++ b/tests/validation/fixtures/GEMMFixture.h
@@ -85,9 +85,9 @@ protected:
         // Create and configure function
         FunctionType gemm;
         // The GEMMinfo includes the values of the depth in case of reinterpreted 3d output.
-        // If the output shape has the same number of dimensions of the input the method called is a 2D matrix multiplication (depth_output_reinterpreted_as_3D = 1),
+        // If the output shape has the same number of dimensions of the input the method called is a 2D matrix multiplication (depth_output_reinterpreted_as_3D = 0),
         // in the other case we have to use the reinterpreted version of GEMM (depth_output_reinterpreted_as_3D = depth of the 3D output).
-        gemm.configure(&a, &b, &c, &dst, alpha, beta, GEMMInfo(false, false, false, (reinterpret_ouput_as_3d ? output_shape[2] : 1), reinterpret_input_as_3d));
+        gemm.configure(&a, &b, &c, &dst, alpha, beta, GEMMInfo(false, false, false, (reinterpret_ouput_as_3d ? output_shape[2] : 0), reinterpret_input_as_3d));
         ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(c.info()->is_resizable(), framework::LogLevel::ERRORS);
diff --git a/tests/validation/fixtures/GEMMLowpFixture.h b/tests/validation/fixtures/GEMMLowpFixture.h
index b61b4eca38..96debe0eec 100644
--- a/tests/validation/fixtures/GEMMLowpFixture.h
+++ b/tests/validation/fixtures/GEMMLowpFixture.h
@@ -76,7 +76,7 @@ protected:
         // The GEMMinfo includes the values of the depth in case of reinterpreted 3d input/output
         FunctionType gemmlowp;
         // TODO (COMPMID-1672) - Extending the test to validate add bias in offset contribution
-        gemmlowp.configure(&a, &b, nullptr, &c, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? shape_c[2] : 1), reinterpret_input_as_3d));
+        gemmlowp.configure(&a, &b, nullptr, &c, GEMMInfo(false, false, false, (reinterpret_output_as_3d ? shape_c[2] : 0), reinterpret_input_as_3d));
 
         ARM_COMPUTE_EXPECT(a.info()->is_resizable(), framework::LogLevel::ERRORS);
         ARM_COMPUTE_EXPECT(b.info()->is_resizable(), framework::LogLevel::ERRORS);
author	Gian Marco Iodice <gianmarco.iodice@arm.com>	2018-11-05 14:26:32 +0000
committer	Gian Marco Iodice <gianmarco.iodice@arm.com>	2018-11-08 13:31:33 +0000
commit	3139f03a74ede3b3bd7cfc6ff219e6c9bc556632 (patch)
tree	cd67b8c9ec1688fcf739c576553b10e435e6938e
parent	421405b6a21b124288a750e2da26dc01eb7391cb (diff)
download	ComputeLibrary-3139f03a74ede3b3bd7cfc6ff219e6c9bc556632.tar.gz