3 files changed, 64 insertions, 51 deletions
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 5bfc832518..adedebba53 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -101,7 +101,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
         build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
 
         // Set scale and offset of the input and output
-        if(is_data_type_quantized_assymetric(dt))
+        if(is_data_type_quantized_asymmetric(dt))
         {
             float s1 = input->info()->quantization_info().scale;
             int   o1 = input->info()->quantization_info().offset;
@@ -127,7 +127,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
     }
 
     // Create kernel
-    std::string kernel_name = is_data_type_quantized_assymetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
+    std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
     _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
 
     // Make sure _kernel is initialized before calling the parent's configure
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 53e46390c1..5f109f76af 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -84,7 +84,12 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     output_shape.set(2, weights->info()->dimension(3));
 
     // Output auto inizialitation if not yet initialized
-    auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+    auto_init_if_empty(*output->info(),
+                       output_shape,
+                       1,
+                       input->info()->data_type(),
+                       input->info()->fixed_point_position(),
+                       input->info()->quantization_info());
 
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
     ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -176,7 +181,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     else
     {
         bool     is_quantized_fixed_point = is_data_type_fixed_point(data_type);
-        bool     is_quantized_asymm       = is_data_type_quantized_assymetric(data_type);
+        bool     is_quantized_asymm       = is_data_type_quantized_asymmetric(data_type);
         DataType promoted_type            = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
 
         build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
@@ -220,7 +225,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
     }
 
     // Set static kernel arguments
-    if(is_data_type_quantized_assymetric(data_type))
+    if(is_data_type_quantized_asymmetric(data_type))
     {
         int output_multiplier = 0;
         int output_shift      = 0;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 2854cd8265..1317278fb5 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -26,6 +26,7 @@
 #include "arm_compute/core/AccessWindowStatic.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
 #include "arm_compute/core/CL/ICLTensor.h"
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
@@ -80,7 +81,12 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
         output_shape.set(0, pooled_w);
         output_shape.set(1, pooled_h);
 
-        auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+        auto_init_if_empty(*output->info(),
+                           output_shape,
+                           1,
+                           input->info()->data_type(),
+                           input->info()->fixed_point_position(),
+                           input->info()->quantization_info());
     }
 
     ARM_COMPUTE_ERROR_THROW_ON(CLPoolingLayerKernel::validate(input->info(), output->info(), pool_info));
@@ -94,80 +100,80 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
     _pool_info   = pool_info;
     _border_size = BorderSize(pool_pad_y, pool_pad_x);
 
-    // Set build options
-    std::set<std::string> build_opts;
-    build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
-    build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
-    if(is_data_type_fixed_point(input->info()->data_type()))
-    {
-        build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
-    }
+    const GPUTarget gpu_target = get_arch_from_target(get_target());
+    const DataType  data_type  = input->info()->data_type();
 
-    build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+    build_opts.add_option_if(is_data_type_fixed_point(data_type),
+                             "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+    build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
     if(pool_type != PoolingType::MAX)
     {
-        if(exclude_padding)
-        {
-            build_opts.emplace("-DEXCLUDE_PADDING");
-        }
-        build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x))));
-        build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y))));
-        build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)));
-        build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)));
-        build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)));
+        build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+        build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x)));
+        build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y)));
+        build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+        build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_x));
+        build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y));
     }
 
     // Create kernel
-    if((pool_size == 2) || (pool_size == 3) || (pool_size == 7))
+    if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
     {
         // Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
         // each thread computes 4 output elements
-        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
+        const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
 
-        int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+        int num_elems_read_per_iteration = pool_size;
         if(is_pool3x3_stride_le3)
         {
-            // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3
+            // Change the number of elements processed and the number of elements read per iteration
+            // for pooling 3x3 with stride less than or equal to 3
             _num_elems_processed_per_iteration = 4;
-            num_elements_read_per_iteration    = pool_size * (pool_stride_x + 1);
+            num_elems_read_per_iteration       = pool_size * (pool_stride_x + 1);
         }
 
-        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+        const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
         const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
 
         _border_size.right  = std::max(upper_bound_w, pool_pad_x);
         _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
 
-        std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
-        if(is_pool3x3_stride_le3)
-        {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
-        }
-        else
-        {
-            _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
-        }
+        std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+                                  + support::cpp11::to_string(pool_size);
+        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
     else // Run general case
     {
-        _num_elems_processed_per_iteration = 1;
-
         const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
         const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
 
         _border_size.right  = std::max(upper_bound_w, pool_pad_x);
         _border_size.bottom = std::max(upper_bound_h, pool_pad_y);
 
-        build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)));
-        if(input->info()->data_type() == DataType::F16)
-        {
-            build_opts.emplace("-DFP16");
-        }
-        _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts));
+        build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size));
+        build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
+
+        std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N";
+        _kernel                 = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
     }
 
     // Configure kernel window
-    Window                 win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+    Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+
+    // Configure the local work size (hint) from the first two dimensions of the global work size.
+    // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_optimized_3
+    // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
+    // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
+    if(gpu_target == GPUTarget::BIFROST)
+    {
+        cl::NDRange gws = ICLKernel::gws_from_window(win);
+        _lws_hint       = cl::NDRange(gws[0], gws[1], 1);
+    }
+
     AccessWindowStatic     input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
     AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
     update_window_and_padding(win, input_access, output_access);
@@ -178,14 +184,16 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
 Error CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
-    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
+                                    "Unsupported combination of parameters!");
 
     int pool_pad_x = 0;
     int pool_pad_y = 0;
     int pool_size  = pool_info.pool_size();
     std::tie(pool_pad_x, pool_pad_y) = pool_info.pad_stride_info().pad();
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_pad_x >= pool_size) || (pool_pad_y >= pool_size)),
-                                    "Invalid pool size and pool pad combination");
+                                    "Invalid pool size and pool pad combination!");
 
     // Checks performed when output is configured
     if(output->total_size() != 0)
@@ -230,7 +238,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
         unsigned int idx = 0;
         add_3D_tensor_argument(idx, _input, in_slice);
         add_3D_tensor_argument(idx, _output, slice);
-        enqueue(queue, *this, slice);
+        enqueue(queue, *this, slice, _lws_hint);
     }
     while(window_collapsed.slide_window_slice_3D(slice));
 }