From 0c54a62f334b6cfdca99066d8de3ed6a0b2fa15e Mon Sep 17 00:00:00 2001
From: Gian Marco Iodice
Date: Tue, 30 Oct 2018 12:20:03 +0000
Subject: COMPMID-1451: Removed output_depth3d from CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat

Since we perform an element-wise operation, it is not necessary to pass
the output_depth3d.

Change-Id: Ibfa07a0706e902acf59b444aa61e18a348162ea9
---
 ...owpQuantizeDownInt32ToUint8ScaleByFloatKernel.h | 41 ++++++-------
 .../runtime/CL/functions/CLGEMMLowpOutputStage.h   | 38 ++++++------
 src/core/CL/cl_kernels/gemmlowp.cl                 | 36 ++++++------
 ...tizeDownInt32ToUint8ScaleByFixedPointKernel.cpp |  7 +--
 ...pQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp | 68 +++++++---------------
 src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp |  8 +--
 6 files changed, 80 insertions(+), 118 deletions(-)

diff --git a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h
index 5a5d3938b7..7256095c03 100644
--- a/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h
+++ b/arm_compute/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.h
@@ -58,34 +58,30 @@ public:
     CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &operator=(CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel &&) = default;
     /** Initialise the kernel's input and output.
      *
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  multiplier      Float multiplier to be multiplied to each element of the input matrix
-     * @param[in]  offset          Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                             Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     * @param[in]  output_3d_depth (Optional) Depth of output in 3D (Defaults to 1)
+     * @param[in]  input      Input tensor. Data type supported: S32
+     * @param[in]  bias       Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                        Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output     Output tensor. Data type supported: QASYMM8
+     * @param[in]  multiplier Float multiplier to be multiplied to each element of the input matrix
+     * @param[in]  offset     Offset to be applied to result before converting it back to QASYMM8
+     * @param[in]  min        (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in]  max        (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                        Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset,
-                   int min = 0, int max = 0, unsigned int output_3d_depth = 1);
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = 0, int max = 0);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel
      *
-     * @param[in] input           Input tensor. Data type supported: S32
-     * @param[in] bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output          Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in] min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in] max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     * @param[in] output_3d_depth (Optional) Depth of output in 3D (Defaults to 1)
+     * @param[in] input  Input tensor. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                           int min = 0, int max = 0, unsigned int output_3d_depth = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 
     // Inherited methods overridden:
     void run(const Window &window, cl::CommandQueue &queue) override;
@@ -94,7 +90,6 @@ private:
     const ICLTensor *_input;
     const ICLTensor *_bias;
     ICLTensor       *_output;
-    bool             _reinterpret_as_3d;
 };
 } // namespace arm_compute
 #endif /* __ARM_COMPUTE_CLGEMMLOWPQUANTIZEDOWNINT32TOUINT8SCALEBYFLOATKERNEL_H__ */
diff --git a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
index 3330b40d8a..cfd1f08519 100644
--- a/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
+++ b/arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h
@@ -163,32 +163,30 @@ class CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat : public ICLSimpleFunction
 public:
     /** Initialise the kernel's inputs, output
      *
-     * @param[in]  input           Input tensor. Data type supported: S32
-     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
-     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[out] output          Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in]  multiplier      Float multiplier to be multiplied to each element of the input matrix
-     * @param[in]  offset          Offset to be applied to result before converting it back to QASYMM8
-     * @param[in]  min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in]  max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                             Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     * @param[in]  output_3d_depth (Optional) Depth of output in 3D (Defaults to 1)
+     * @param[in]  input      Input tensor. Data type supported: S32
+     * @param[in]  bias       Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                        Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[out] output     Output tensor. Data type supported: QASYMM8
+     * @param[in]  multiplier Float multiplier to be multiplied to each element of the input matrix
+     * @param[in]  offset     Offset to be applied to result before converting it back to QASYMM8
+     * @param[in]  min        (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in]  max        (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                        Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      */
-    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = 0, int max = 0, unsigned int output_3d_depth = 1);
+    void configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset, int min = 0, int max = 0);
     /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint
      *
-     * @param[in] input           Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
-     * @param[in] bias            Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
-     *                            Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
-     * @param[in] output          Output tensor. Data type supported: Data type supported: QASYMM8
-     * @param[in] min             (Optional) Min value used to saturate down the output result before converting back to QASYMM8
-     * @param[in] max             (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
-     *                            Along with @p min, this value can be used to implement "rectified linear unit" activation functions
-     * @param[in] output_3d_depth (Optional) Depth of output in 3D (Defaults to 1)
+     * @param[in] input  Input tensor. It is the output of @ref CLGEMMLowpMatrixMultiplyCore function. Data type supported: S32
+     * @param[in] bias   Biases tensor. Only shared biases supported and it can be a nullptr if the addition of biases is not required.
+     *                   Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p input.
+     * @param[in] output Output tensor. Data type supported: QASYMM8
+     * @param[in] min    (Optional) Min value used to saturate down the output result before converting back to QASYMM8
+     * @param[in] max    (Optional) Max value used to saturate up the output result before converting back to QASYMM8,
+     *                   Along with @p min, this value can be used to implement "rectified linear unit" activation functions
      *
      * @return a status
      */
-    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0, unsigned int output_3d_depth = 1);
+    static Status validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min = 0, int max = 0);
 };
 } // namespace arm_compute
 #endif /*__ARM_COMPUTE_CLGEMMLOWPOUTPUTSTAGE_H__ */
\ No newline at end of file
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/gemmlowp.cl
index 35e0d9dba5..f2467b721a 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/gemmlowp.cl
@@ -710,7 +710,7 @@ __kernel void gemmlowp_mm_interleaved_transposed_bifrost_dot8(IMAGE_DECLARATION(
     {
         // Load values from matrix A (interleaved) and matrix B (transposed)
         uchar16 a0 = vload16(0, src_addr_a + (i_left_over % 4) + ((i_left_over / 4) * 16));
-        uchar4 b0 = vload4(0, src_addr_b);
+        uchar4  b0 = vload4(0, src_addr_b);
 
         c00 += a0.s0 * b0.s0;
         c01 += a0.s0 * b0.s1;
@@ -3225,40 +3225,38 @@ __kernel void gemmlowp_output_stage_quantize_down_float(TENSOR3D_DECLARATION(src
 #endif // defined(DST_HEIGHT)
 {
     // Compute source and destination addresses
-    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
-#if defined(DST_HEIGHT)
-    Tensor4D dst = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(dst, 1);
-    dst.ptr += get_global_id(0) * dst_step_x + (get_global_id(1) % DST_HEIGHT) * dst_step_y + (get_global_id(1) / DST_HEIGHT) * dst_step_z + get_global_id(2) * dst_step_w;
-#else  // defined(DST_HEIGHT)
-    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-#endif // defined(DST_HEIGHT)
+    int x = get_global_id(0) * 4;
+    int y = get_global_id(1);
+    int z = get_global_id(2);
 
-#if defined(ADD_BIAS)
-    Vector biases = CONVERT_TO_VECTOR_STRUCT(biases);
-#endif // defined(ADD_BIAS)
+    __global uchar *src_addr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(int) + y * src_stride_y + z * src_stride_z;
 
-    int16 input_values = vload16(0, (__global int *)src.ptr);
+    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x + y * dst_stride_y + z * dst_stride_z;
+
+    int4 input_values = vload4(0, (__global int *)src_addr);
 
 #if defined(ADD_BIAS)
     // Add bias
-    const int16 biases_values = vload16(0, (__global int *)biases.ptr);
-    input_values += (int16)biases_values;
+    __global uchar *bias_addr = biases_ptr + biases_offset_first_element_in_bytes + x * sizeof(int);
+
+    int4 biases_values = vload4(0, (__global int *)bias_addr);
+    input_values += (int4)biases_values;
#endif // defined(ADD_BIAS)
 
     // Convert to float
-    float16 input_values_f = convert_float16(input_values);
+    float4 input_values_f = convert_float4(input_values);
     input_values_f = round(input_values_f * (float)REAL_MULTIPLIER + (float)OUTPUT_OFFSET);
 
-    uchar16 res = convert_uchar16_sat(input_values_f);
+    uchar4 res = convert_uchar4_sat(input_values_f);
 
 #if defined(MIN_BOUND)
-    res = max(res, (uchar16)MIN_BOUND);
+    res = max(res, (uchar4)MIN_BOUND);
 #endif // defined(MIN_BOUND)
 #if defined(MAX_BOUND)
-    res = min(res, (uchar16)MAX_BOUND);
+    res = min(res, (uchar4)MAX_BOUND);
 #endif // defined(MAX_BOUND)
 
     // Store the result
-    vstore16(res, 0, dst.ptr);
+    vstore4(res, 0, dst_addr);
 }
 #endif // defined(REAL_MULTIPLIER) && defined(OUTPUT_OFFSET)
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
index 38e0474dde..b7eff0f8ec 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel.cpp
@@ -69,6 +69,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen
 {
     constexpr unsigned int num_elems_processed_per_iteration = 4;
 
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
+
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
@@ -124,10 +127,6 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPointKernel::configure(const
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-
-    // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8));
-
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
diff --git a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
index f0096bd3ad..b7730d5060 100644
--- a/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
+++ b/src/core/CL/kernels/CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel.cpp
@@ -42,7 +42,7 @@ namespace arm_compute
 namespace
 {
 Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                          int min, int max, unsigned int output_3d_depth)
+                          int min, int max)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::S32);
     ARM_COMPUTE_RETURN_ERROR_ON(max > 255);
@@ -58,10 +58,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 
     if(output->total_size() != 0)
     {
-        const TensorShape output_shape      = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input, output_3d_depth, true);
-        const TensorInfo tensor_info_output = output->clone()->set_tensor_shape(output_shape);
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::QASYMM8);
-        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(input, output);
     }
 
     return Status{};
@@ -69,7 +67,10 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con
 
 std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output)
 {
-    constexpr unsigned int num_elems_processed_per_iteration = 16;
+    constexpr unsigned int num_elems_processed_per_iteration = 4;
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*output, input->clone()->set_data_type(DataType::QASYMM8));
 
     // Configure kernel window
     Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration));
 
@@ -103,15 +104,14 @@ class Coordinates;
 } // namespace arm_compute
 
 CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel()
-    : _input(nullptr), _bias(nullptr), _output(nullptr), _reinterpret_as_3d(false)
+    : _input(nullptr), _bias(nullptr), _output(nullptr)
 {
 }
 
-Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                      int min, int max, unsigned int output_3d_depth)
+Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, int min, int max)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max, output_3d_depth));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(input, bias, output, min, max));
     ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(input->clone().get(),
                                                               (bias != nullptr) ? bias->clone().get() : nullptr,
                                                               output->clone().get())
@@ -122,22 +122,15 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(const ITen
 
 void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset,
-                                                                     int min, int max, unsigned int output_3d_depth)
+                                                                     int min, int max)
 {
     // Perform validate step
     ARM_COMPUTE_ERROR_ON_NULLPTR(input, output);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(), min, max));
 
-    // Output auto inizialitation if not yet initialized
-    const TensorShape output_shape = arm_compute::misc::shape_calculator::compute_output_stage_shape(*input->info(), output_3d_depth, true);
-    auto_init_if_empty(*output->info(), input->info()->clone()->set_data_type(DataType::QASYMM8).set_tensor_shape(output_shape));
-
-    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(input->info(), (bias != nullptr) ? bias->info() : nullptr, output->info(),
-                                                  min, max, output_3d_depth));
-
-    _input             = input;
-    _bias              = bias;
-    _output            = output;
-    _reinterpret_as_3d = output_3d_depth > 1;
+    _input  = input;
+    _bias   = bias;
+    _output = output;
 
     // Set the arguments to pass at compile time
     CLBuildOptions build_opts;
@@ -146,7 +139,6 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::configure(const ICLTe
     build_opts.add_option_if((min != 0) && (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min));
     build_opts.add_option_if((max != 255) && (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max));
     build_opts.add_option_if(bias != nullptr, "-DADD_BIAS");
-    build_opts.add_option_if(_reinterpret_as_3d, "-DDST_HEIGHT=" + support::cpp11::to_string(input->info()->tensor_shape().y() / output_3d_depth));
 
     // Create kernel
     _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("gemmlowp_output_stage_quantize_down_float", build_opts.options()));
@@ -176,32 +168,12 @@ void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::run(const Window &win
         add_1D_tensor_argument(idx1, _bias, biases_slice);
     }
 
-    if(_reinterpret_as_3d)
-    {
-        // Create output window
-        Window window_out;
-        window_out.use_tensor_dimensions(_output->info()->tensor_shape());
-        Window collapsed_out = window_out.collapse_if_possible(window_out, 3);
-        Window slice_out     = collapsed.first_slice_window_4D();
-
-        do
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, slice);
-            add_4D_tensor_argument(idx1, _output, slice_out);
-            enqueue(queue, *this, slice);
-        }
-        while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out));
-    }
-    else
+    do
     {
-        do
-        {
-            unsigned int idx = 0;
-            add_3D_tensor_argument(idx, _input, slice);
-            add_3D_tensor_argument(idx1, _output, slice);
-            enqueue(queue, *this, slice);
-        }
-        while(collapsed.slide_window_slice_3D(slice));
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, _input, slice);
+        add_3D_tensor_argument(idx1, _output, slice);
+        enqueue(queue, *this, slice);
     }
+    while(collapsed.slide_window_slice_3D(slice));
 }
diff --git a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
index f1c24626dc..f1282cbde9 100644
--- a/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
+++ b/src/runtime/CL/functions/CLGEMMLowpOutputStage.cpp
@@ -60,16 +60,16 @@ Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFixedPoint::validate(const ITens
 
 void CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::configure(const ICLTensor *input, const ICLTensor *bias, ICLTensor *output, float multiplier, int offset,
-                                                               int min, int max, unsigned int output_3d_depth)
+                                                               int min, int max)
 {
     auto k = arm_compute::support::cpp14::make_unique<CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel>();
-    k->configure(input, bias, output, multiplier, offset, min, max, output_3d_depth);
+    k->configure(input, bias, output, multiplier, offset, min, max);
     _kernel = std::move(k);
 }
 
 Status CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output,
-                                                                int min, int max, unsigned int output_3d_depth)
+                                                                int min, int max)
 {
-    return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max, output_3d_depth);
+    return CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloatKernel::validate(input, bias, output, min, max);
 }
 } // namespace arm_compute
\ No newline at end of file
--
cgit v1.2.1
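For context, a minimal caller-side sketch of the API after this patch (not part of the commit). The shape, multiplier, offset and bounds below are made-up example values; the point is that configure() and validate() no longer take an output_3d_depth argument, and the QASYMM8 output keeps exactly the shape of the S32 input because the stage is element-wise.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Types.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "arm_compute/runtime/CL/functions/CLGEMMLowpOutputStage.h"

using namespace arm_compute;

int main()
{
    CLScheduler::get().default_init();

    // S32 accumulators, e.g. produced by CLGEMMLowpMatrixMultiplyCore (example shape).
    CLTensor input;
    input.allocator()->init(TensorInfo(TensorShape(24U, 16U, 8U), 1, DataType::S32));

    // Element-wise output stage: same shape as the input, QASYMM8 data type.
    CLTensor output;
    output.allocator()->init(TensorInfo(TensorShape(24U, 16U, 8U), 1, DataType::QASYMM8));

    // Before this patch, both calls below took a trailing output_3d_depth argument.
    Status status = CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat::validate(
        input.info(), nullptr /* no bias */, output.info(), 0 /* min */, 255 /* max */);

    CLGEMMLowpQuantizeDownInt32ToUint8ScaleByFloat output_stage;
    output_stage.configure(&input, nullptr /* no bias */, &output,
                           0.0036f /* multiplier */, 128 /* offset */,
                           0 /* min */, 255 /* max */);

    input.allocator()->allocate();
    output.allocator()->allocate();

    output_stage.run();
    CLScheduler::get().sync();
    return 0;
}

Since the kernel now processes four values per work item (vload4/vstore4 instead of vload16/vstore16) over a plain 3D window, no 4D output window or DST_HEIGHT build option is needed, which is what allowed _reinterpret_as_3d and the second run() path to be deleted.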