author    Anton Lokhmotov <psyhtest@users.noreply.github.com>  2017-11-08 09:34:19 +0000
committer Anthony Barbier <anthony.barbier@arm.com>            2018-11-02 16:35:24 +0000
commit    af6204c331eed7894ec4c5fd4e98ec22b6dac676 (patch)
tree      7c952fbcf3ec11ff4c9d49061be12742e9cd979d /src
parent    3a873a578e80481a55ce3b885078948ae79468eb (diff)
COMPMID-661: Add avgpool-uint8 support. Optimize avgpool-fp32 for Bifrost. (#13)
Change-Id: I32ba6afbac6694ffa053dd16f03a1b3d14627a19
Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94857
Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com>
Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src')
-rw-r--r-- src/core/CL/CLKernelLibrary.cpp | 7
-rw-r--r-- src/core/CL/cl_kernels/pooling_layer.cl | 101
-rw-r--r-- src/core/CL/cl_kernels/pooling_layer_quantized.cl | 121
-rw-r--r-- src/core/CL/kernels/CLActivationLayerKernel.cpp | 4
-rw-r--r-- src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp | 11
-rw-r--r-- src/core/CL/kernels/CLPoolingLayerKernel.cpp | 100
-rw-r--r-- src/runtime/CL/functions/CLPoolingLayer.cpp | 14
-rw-r--r-- src/runtime/CL/functions/CLSoftmaxLayer.cpp | 2
8 files changed, 207 insertions, 153 deletions
diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp
index 6efeebd63f..6ebdf298f1 100644
--- a/src/core/CL/CLKernelLibrary.cpp
+++ b/src/core/CL/CLKernelLibrary.cpp
@@ -271,9 +271,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map =
{ "pixelwise_mul_int", "pixelwise_mul_int.cl" },
{ "pooling_layer_2", "pooling_layer.cl" },
{ "pooling_layer_3", "pooling_layer.cl" },
- { "pooling_layer_3_optimized", "pooling_layer.cl" },
+ { "pooling_layer_optimized_3", "pooling_layer.cl" },
{ "pooling_layer_7", "pooling_layer.cl" },
{ "pooling_layer_N", "pooling_layer.cl" },
+ { "pooling_layer_N_quantized", "pooling_layer_quantized.cl" },
{ "quantization_layer", "quantization_layer.cl" },
{ "reduction_operation", "reduction_operation.cl" },
{ "remap_nearest_neighbour", "remap.cl" },
@@ -546,6 +547,10 @@ const std::map<std::string, std::string> CLKernelLibrary::_program_source_map =
#include "./cl_kernels/pooling_layer.clembed"
},
{
+ "pooling_layer_quantized.cl",
+#include "./cl_kernels/pooling_layer_quantized.clembed"
+ },
+ {
"quantization_layer.cl",
#include "./cl_kernels/quantization_layer.clembed"
},
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 635c44a849..ee8ff27ab7 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -375,7 +375,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void pooling_layer_3_optimized(
+__kernel void pooling_layer_optimized_3(
TENSOR3D_DECLARATION(input),
TENSOR3D_DECLARATION(output))
{
@@ -403,103 +403,6 @@ __kernel void pooling_layer_3_optimized(
}
#endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_7(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 8);
- data1 = POW2_OP(data1, 8);
- data2 = POW2_OP(data2, 8);
- data3 = POW2_OP(data3, 8);
- data4 = POW2_OP(data4, 8);
- data5 = POW2_OP(data5, 8);
- data6 = POW2_OP(data6, 8);
-#endif /* defined(POOL_L2) */
-
- // Pool operation of all rows
- data0 = POOL_OP(data0, data1);
- data2 = POOL_OP(data2, data3);
- data4 = POOL_OP(data4, data5);
- data0 = POOL_OP(data0, data2);
- data4 = POOL_OP(data4, data6);
- data0 = POOL_OP(data0, data4);
-
- // Set last element
-#if defined(POOL_AVG) || defined(POOL_L2)
- data0.s7 = 0;
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
- data0.s7 = data0.s6;
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
- // Reduce result
- VEC_DATA_TYPE(DATA_TYPE, 4)
- reduce4 = POOL_OP(data0.s0123, data0.s4567);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = res;
-}
-
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
@@ -608,4 +511,4 @@ __kernel void pooling_layer_N(
// Store result
*(__global DATA_TYPE *)output.ptr = res;
}
-#endif // defined(POOL_SIZE)
\ No newline at end of file
+#endif // defined(POOL_SIZE)
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..17448d19de
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size, upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N_quantized(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int8 vdata = 0;
+ int sdata = 0;
+
+ // Load data
+ for(int y = 0; y < POOL_SIZE; y++)
+ {
+ int x = 0;
+ for(; x <= ((int)POOL_SIZE - 8); x += 8)
+ {
+ uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0));
+ int8 data0 = convert_int8(data);
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < (int)POOL_SIZE; ++x)
+ {
+ uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0));
+ int data0 = convert_int(data);
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
+
+ // Reduce result
+ int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ int res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) */
+
+ // Store result
+ *(__global uchar *)output.ptr = convert_uchar(res);
+}
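
The new pooling_layer_N_quantized kernel accumulates the uint8 samples as 32-bit integers and, for POOL_AVG, divides by the window area returned by calculate_avg_scale (the window is clipped to MAX_WIDTH/MAX_HEIGHT on the right/bottom and, with -DEXCLUDE_PADDING, to the tensor on the left/top). A minimal host-side C++ sketch of that arithmetic for a single output element follows; the function name, row-major layout and border_value parameter are illustrative assumptions, not part of this patch, and the sketch mirrors the intent of the computation rather than the kernel's exact vectorized memory accesses.

#include <algorithm>
#include <cstdint>
#include <vector>

// Illustrative scalar reference for one QASYMM8 average-pooling output element.
// 'in' is a width x height plane in row-major order; out-of-range samples take
// 'border_value' (the value the border handler fills the padding with).
static uint8_t avg_pool_qasymm8_ref(const std::vector<uint8_t> &in, int width, int height,
                                    int pool_size, int stride_x, int stride_y,
                                    int pad_x, int pad_y, bool exclude_padding,
                                    int out_x, int out_y, uint8_t border_value)
{
    const int max_w = width + (exclude_padding ? 0 : pad_x);
    const int max_h = height + (exclude_padding ? 0 : pad_y);
    int start_x = out_x * stride_x - pad_x;
    int start_y = out_y * stride_y - pad_y;
    const int end_x = std::min(start_x + pool_size, max_w);
    const int end_y = std::min(start_y + pool_size, max_h);
    if(exclude_padding)
    {
        start_x = std::max(0, start_x);
        start_y = std::max(0, start_y);
    }
    // Same role as calculate_avg_scale(): the number of samples the sum is divided by.
    const int scale = (end_x - start_x) * (end_y - start_y);

    int sum = 0;
    for(int y = start_y; y < end_y; ++y)
    {
        for(int x = start_x; x < end_x; ++x)
        {
            const bool inside = (x >= 0) && (y >= 0) && (x < width) && (y < height);
            sum += inside ? in[y * width + x] : border_value; // padded samples come from the border fill
        }
    }
    // DIV_OP followed by convert_uchar in the kernel; plain integer division here.
    return static_cast<uint8_t>(sum / scale);
}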
diff --git a/src/core/CL/kernels/CLActivationLayerKernel.cpp b/src/core/CL/kernels/CLActivationLayerKernel.cpp
index 5bfc832518..adedebba53 100644
--- a/src/core/CL/kernels/CLActivationLayerKernel.cpp
+++ b/src/core/CL/kernels/CLActivationLayerKernel.cpp
@@ -101,7 +101,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
build_opts.emplace(("-DB_VAL=" + support::cpp11::to_string(b_const_int)));
// Set scale and offset of the input and output
- if(is_data_type_quantized_assymetric(dt))
+ if(is_data_type_quantized_asymmetric(dt))
{
float s1 = input->info()->quantization_info().scale;
int o1 = input->info()->quantization_info().offset;
@@ -127,7 +127,7 @@ void CLActivationLayerKernel::configure(ICLTensor *input, ICLTensor *output, Act
}
// Create kernel
- std::string kernel_name = is_data_type_quantized_assymetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
+ std::string kernel_name = is_data_type_quantized_asymmetric(dt) ? std::string("activation_layer_qa8") : std::string("activation_layer");
_kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
// Make sure _kernel is initialized before calling the parent's configure
diff --git a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
index 53e46390c1..5f109f76af 100644
--- a/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
+++ b/src/core/CL/kernels/CLDirectConvolutionLayerKernel.cpp
@@ -84,7 +84,12 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
output_shape.set(2, weights->info()->dimension(3));
// Output auto inizialitation if not yet initialized
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(),
+ output_shape,
+ 1,
+ input->info()->data_type(),
+ input->info()->fixed_point_position(),
+ input->info()->quantization_info());
ARM_COMPUTE_ERROR_ON_MISMATCHING_DIMENSIONS(output->info()->tensor_shape(), output_shape);
ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input, output);
@@ -176,7 +181,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
else
{
bool is_quantized_fixed_point = is_data_type_fixed_point(data_type);
- bool is_quantized_asymm = is_data_type_quantized_assymetric(data_type);
+ bool is_quantized_asymm = is_data_type_quantized_asymmetric(data_type);
DataType promoted_type = (is_quantized_fixed_point) ? get_promoted_data_type(data_type) : data_type;
build_options.add_option_if(is_quantized_asymm, std::string("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)));
@@ -220,7 +225,7 @@ void CLDirectConvolutionLayerKernel::configure(const ICLTensor *input, const ICL
}
// Set static kernel arguments
- if(is_data_type_quantized_assymetric(data_type))
+ if(is_data_type_quantized_asymmetric(data_type))
{
int output_multiplier = 0;
int output_shift = 0;
diff --git a/src/core/CL/kernels/CLPoolingLayerKernel.cpp b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
index 2854cd8265..1317278fb5 100644
--- a/src/core/CL/kernels/CLPoolingLayerKernel.cpp
+++ b/src/core/CL/kernels/CLPoolingLayerKernel.cpp
@@ -26,6 +26,7 @@
#include "arm_compute/core/AccessWindowStatic.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLKernel.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/Helpers.h"
@@ -80,7 +81,12 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
output_shape.set(0, pooled_w);
output_shape.set(1, pooled_h);
- auto_init_if_empty(*output->info(), output_shape, 1, input->info()->data_type(), input->info()->fixed_point_position());
+ auto_init_if_empty(*output->info(),
+ output_shape,
+ 1,
+ input->info()->data_type(),
+ input->info()->fixed_point_position(),
+ input->info()->quantization_info());
}
ARM_COMPUTE_ERROR_THROW_ON(CLPoolingLayerKernel::validate(input->info(), output->info(), pool_info));
@@ -94,80 +100,80 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
_pool_info = pool_info;
_border_size = BorderSize(pool_pad_y, pool_pad_x);
- // Set build options
- std::set<std::string> build_opts;
- build_opts.emplace(("-DDATA_TYPE=" + get_cl_type_from_data_type(input->info()->data_type())));
- build_opts.emplace(("-DPOOL_" + string_from_pooling_type(pool_type)));
- if(is_data_type_fixed_point(input->info()->data_type()))
- {
- build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
- }
+ const GPUTarget gpu_target = get_arch_from_target(get_target());
+ const DataType data_type = input->info()->data_type();
- build_opts.emplace(("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)));
+ // Set build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+ build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type));
+ build_opts.add_option_if(is_data_type_fixed_point(data_type),
+ "-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input->info()->fixed_point_position()));
+ build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x));
if(pool_type != PoolingType::MAX)
{
- if(exclude_padding)
- {
- build_opts.emplace("-DEXCLUDE_PADDING");
- }
- build_opts.emplace(("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x))));
- build_opts.emplace(("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y))));
- build_opts.emplace(("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)));
- build_opts.emplace(("-DPAD_X=" + support::cpp11::to_string(pool_pad_x)));
- build_opts.emplace(("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y)));
+ build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING");
+ build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(input->info()->dimension(0) + (exclude_padding ? 0 : pool_pad_x)));
+ build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(input->info()->dimension(1) + (exclude_padding ? 0 : pool_pad_y)));
+ build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y));
+ build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_x));
+ build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_y));
}
// Create kernel
- if((pool_size == 2) || (pool_size == 3) || (pool_size == 7))
+ if((pool_size == 3) && !is_data_type_quantized_asymmetric(data_type))
{
// Check if we have pool3x3 with stride_x less equal than 3. In these cases, run an optimized OpenCL kernel where
// each thread computes 4 output elements
- const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(input->info()->data_type());
+ const bool is_pool3x3_stride_le3 = (pool_size == 3) && (pool_stride_x <= 3) && !is_data_type_fixed_point(data_type);
- int num_elements_read_per_iteration = (pool_size == 7) ? 8 : pool_size;
+ int num_elems_read_per_iteration = pool_size;
if(is_pool3x3_stride_le3)
{
- // Change the number of elements processed and number of elements read per iteration for pooling 3x3 with stride less equal than 3
+ // Change the number of elements processed and the number of elements read per iteration
+ // for pooling 3x3 with stride less equal than 3
_num_elems_processed_per_iteration = 4;
- num_elements_read_per_iteration = pool_size * (pool_stride_x + 1);
+ num_elems_read_per_iteration = pool_size * (pool_stride_x + 1);
}
- const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elements_read_per_iteration) - input_width;
+ const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + num_elems_read_per_iteration) - input_width;
const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
_border_size.right = std::max(upper_bound_w, pool_pad_x);
_border_size.bottom = std::max(upper_bound_h, pool_pad_y);
- std::string kernel_name = "pooling_layer_" + support::cpp11::to_string(pool_size);
- if(is_pool3x3_stride_le3)
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name + "_optimized", build_opts));
- }
- else
- {
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts));
- }
+ std::string kernel_name = ((is_pool3x3_stride_le3) ? "pooling_layer_optimized_" : "pooling_layer_")
+ + support::cpp11::to_string(pool_size);
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
}
else // Run general case
{
- _num_elems_processed_per_iteration = 1;
-
const int upper_bound_w = ((pooled_w - 1) * pool_stride_x - pool_pad_x + pool_size) - input_width;
const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_y + pool_size) - input_height;
_border_size.right = std::max(upper_bound_w, pool_pad_x);
_border_size.bottom = std::max(upper_bound_h, pool_pad_y);
- build_opts.emplace(("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size)));
- if(input->info()->data_type() == DataType::F16)
- {
- build_opts.emplace("-DFP16");
- }
- _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("pooling_layer_N", build_opts));
+ build_opts.add_option("-DPOOL_SIZE=" + support::cpp11::to_string(pool_size));
+ build_opts.add_option_if(data_type == DataType::F16, "-DFP16");
+
+ std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_N_quantized" : "pooling_layer_N";
+ _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options()));
}
// Configure kernel window
- Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+ Window win = calculate_max_window(*output->info(), Steps(_num_elems_processed_per_iteration));
+
+ // Configure the local work size (hint) from the first two dimensions of the global work size.
+ // On Bifrost, this works for up to 35x35xC filters, for which the pooling_layer_3_optimized
+ // kernel is launched with gws=(9, 33, C). In any case, the hint will be ignored if it is
+ // invalid (e.g. exceeds the maximum workgroup size that the kernel can be launched with).
+ if(gpu_target == GPUTarget::BIFROST)
+ {
+ cl::NDRange gws = ICLKernel::gws_from_window(win);
+ _lws_hint = cl::NDRange(gws[0], gws[1], 1);
+ }
+
AccessWindowStatic input_access(input->info(), -pool_pad_x, -pool_pad_y, input_width + _border_size.right, input_height + _border_size.bottom);
AccessWindowHorizontal output_access(output->info(), 0, _num_elems_processed_per_iteration);
update_window_and_padding(win, input_access, output_access);
@@ -178,14 +184,16 @@ void CLPoolingLayerKernel::configure(const ICLTensor *input, ICLTensor *output,
Error CLPoolingLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
{
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output);
- ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QS8, DataType::QS16, DataType::F16, DataType::F32);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(input->data_type()) && pool_info.pool_type() == PoolingType::L2),
+ "Unsupported combination of parameters!");
int pool_pad_x = 0;
int pool_pad_y = 0;
int pool_size = pool_info.pool_size();
std::tie(pool_pad_x, pool_pad_y) = pool_info.pad_stride_info().pad();
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((pool_pad_x >= pool_size) || (pool_pad_y >= pool_size)),
- "Invalid pool size and pool pad combination");
+ "Invalid pool size and pool pad combination!");
// Checks performed when output is configured
if(output->total_size() != 0)
@@ -230,7 +238,7 @@ void CLPoolingLayerKernel::run(const Window &window, cl::CommandQueue &queue)
unsigned int idx = 0;
add_3D_tensor_argument(idx, _input, in_slice);
add_3D_tensor_argument(idx, _output, slice);
- enqueue(queue, *this, slice);
+ enqueue(queue, *this, slice, _lws_hint);
}
while(window_collapsed.slide_window_slice_3D(slice));
}
diff --git a/src/runtime/CL/functions/CLPoolingLayer.cpp b/src/runtime/CL/functions/CLPoolingLayer.cpp
index 6ca1a33b0c..ac360fbb3d 100644
--- a/src/runtime/CL/functions/CLPoolingLayer.cpp
+++ b/src/runtime/CL/functions/CLPoolingLayer.cpp
@@ -23,21 +23,33 @@
*/
#include "arm_compute/runtime/CL/functions/CLPoolingLayer.h"
+#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/kernels/CLPoolingLayerKernel.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
#include "support/ToolchainSupport.h"
using namespace arm_compute;
void CLPoolingLayer::configure(ICLTensor *input, ICLTensor *output, const PoolingLayerInfo &pool_info)
{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(input);
+
// Configure pooling kernel
auto k = arm_compute::support::cpp14::make_unique<CLPoolingLayerKernel>();
+ k->set_target(CLScheduler::get().target());
k->configure(input, output, pool_info);
_kernel = std::move(k);
// Configure border depending on operation required
BorderMode border_mode = (PoolingType::MAX == pool_info.pool_type()) ? BorderMode::REPLICATE : BorderMode::CONSTANT;
- _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(0));
+ // Quantize border in case data type is quantized asymmetric data type
+ uint32_t border_value = 0;
+ if(is_data_type_quantized_asymmetric(input->info()->data_type()) && !pool_info.exclude_padding())
+ {
+ border_value = static_cast<uint32_t>(input->info()->quantization_info().quantize(0));
+ }
+
+ _border_handler.configure(input, _kernel->border_size(), border_mode, PixelValue(border_value));
}
Error CLPoolingLayer::validate(const ITensorInfo *input, const ITensorInfo *output, const PoolingLayerInfo &pool_info)
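
For CONSTANT borders the fill value chosen above is the QASYMM8 encoding of the real value 0: with asymmetric quantization q = clamp(round(r / scale) + offset, 0, 255), a real 0 maps to the offset, so padded samples contribute "zero" in the real domain when they enter an average. A minimal sketch of that mapping follows; the helper name and rounding mode are assumptions for illustration, not the library's QuantizationInfo API.

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical stand-in for quantizing a real value to QASYMM8:
// q = clamp(round(real / scale) + offset, 0, 255).
static uint8_t quantize_qasymm8(float real, float scale, int offset)
{
    const int q = static_cast<int>(std::lround(real / scale)) + offset;
    return static_cast<uint8_t>(std::min(255, std::max(0, q)));
}

// Example: with scale = 0.05f and offset = 10, the real value 0.0f quantizes to 10,
// so the constant border is filled with 10 rather than with a raw 0.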
diff --git a/src/runtime/CL/functions/CLSoftmaxLayer.cpp b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
index 7268d8eab5..a059f9e5fd 100644
--- a/src/runtime/CL/functions/CLSoftmaxLayer.cpp
+++ b/src/runtime/CL/functions/CLSoftmaxLayer.cpp
@@ -62,7 +62,7 @@ void CLSoftmaxLayer::configure(const ICLTensor *input, ICLTensor *output, float
// Configure kernels
// TODO (COMPMID-661): Remove legacy path once the new one is properly validated
- _run_legacy_path = is_data_type_quantized_assymetric(input->info()->data_type());
+ _run_legacy_path = is_data_type_quantized_asymmetric(input->info()->data_type());
if(_run_legacy_path)
{
_max_kernel.configure(input, &_max);