aboutsummaryrefslogtreecommitdiff
path: root/src/core/CL/cl_kernels
diff options
context:
space:
mode:
authorAnton Lokhmotov <psyhtest@users.noreply.github.com>2017-11-08 09:34:19 +0000
committerAnthony Barbier <anthony.barbier@arm.com>2018-11-02 16:35:24 +0000
commitaf6204c331eed7894ec4c5fd4e98ec22b6dac676 (patch)
tree7c952fbcf3ec11ff4c9d49061be12742e9cd979d /src/core/CL/cl_kernels
parent3a873a578e80481a55ce3b885078948ae79468eb (diff)
downloadComputeLibrary-af6204c331eed7894ec4c5fd4e98ec22b6dac676.tar.gz
COMPMID-661: Add avgpool-uint8 support. Optimize avgpool-fp32 for Bifrost. (#13)
Change-Id: I32ba6afbac6694ffa053dd16f03a1b3d14627a19 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/94857 Tested-by: Kaizen <jeremy.johnson+kaizengerrit@arm.com> Reviewed-by: Anthony Barbier <anthony.barbier@arm.com>
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl101
-rw-r--r--src/core/CL/cl_kernels/pooling_layer_quantized.cl121
2 files changed, 123 insertions, 99 deletions
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
index 635c44a849..ee8ff27ab7 100644
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ b/src/core/CL/cl_kernels/pooling_layer.cl
@@ -375,7 +375,7 @@ calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upp
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
*/
-__kernel void pooling_layer_3_optimized(
+__kernel void pooling_layer_optimized_3(
TENSOR3D_DECLARATION(input),
TENSOR3D_DECLARATION(output))
{
@@ -403,103 +403,6 @@ __kernel void pooling_layer_3_optimized(
}
#endif // defined(POOLING3x3) && !defined(FIXED_POINT_POSITION)
-/** Performs a pooling function of pool size equal to 7.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are QS8/QS16/F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QS8/QS16/F16/F32
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_7(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data0 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data1 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data2 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data3 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 3, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data4 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 4, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data5 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 5, 0));
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data6 = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 6, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 8);
- data1 = POW2_OP(data1, 8);
- data2 = POW2_OP(data2, 8);
- data3 = POW2_OP(data3, 8);
- data4 = POW2_OP(data4, 8);
- data5 = POW2_OP(data5, 8);
- data6 = POW2_OP(data6, 8);
-#endif /* defined(POOL_L2) */
-
- // Pool operation of all rows
- data0 = POOL_OP(data0, data1);
- data2 = POOL_OP(data2, data3);
- data4 = POOL_OP(data4, data5);
- data0 = POOL_OP(data0, data2);
- data4 = POOL_OP(data4, data6);
- data0 = POOL_OP(data0, data4);
-
- // Set last element
-#if defined(POOL_AVG) || defined(POOL_L2)
- data0.s7 = 0;
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
- data0.s7 = data0.s6;
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
- // Reduce result
- VEC_DATA_TYPE(DATA_TYPE, 4)
- reduce4 = POOL_OP(data0.s0123, data0.s4567);
- VEC_DATA_TYPE(DATA_TYPE, 2)
- reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res = DIV_OP(res, calculate_avg_scale(7, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = res;
-}
-
#if defined(POOL_SIZE)
// Set the initial value for the pooling operation accordingly with the data type
@@ -608,4 +511,4 @@ __kernel void pooling_layer_N(
// Store result
*(__global DATA_TYPE *)output.ptr = res;
}
-#endif // defined(POOL_SIZE) \ No newline at end of file
+#endif // defined(POOL_SIZE)
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..17448d19de
--- /dev/null
+++ b/src/core/CL/cl_kernels/pooling_layer_quantized.cl
@@ -0,0 +1,121 @@
+/*
+ * Copyright (c) 2017 ARM Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size, upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N
+ *
+ * @note Pool size must be passed using -DPOOL_SIZE e.g. -DPOOL_SIZE=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_N_quantized(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int8 vdata = 0;
+ int sdata = 0;
+
+ // Load data
+ for(int y = 0; y < POOL_SIZE; y++)
+ {
+ int x = 0;
+ for(; x <= ((int)POOL_SIZE - 8); x += 8)
+ {
+ uchar8 data = vload8(0, (__global uchar *)tensor3D_offset(&input, x, y, 0));
+ int8 data0 = convert_int8(data);
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < (int)POOL_SIZE; ++x)
+ {
+ uchar data = *((__global uchar *)tensor3D_offset(&input, x, y, 0));
+ int data0 = convert_int(data);
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
+
+ // Reduce result
+ int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ int res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) */
+
+ // Store result
+ *(__global uchar *)output.ptr = convert_uchar(res);
+}