diff options
Diffstat (limited to 'src')
-rw-r--r-- | src/core/CL/cl_kernels/batch_to_space.cl | 232 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/batchnormalization_layer.cl | 418 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/activation_layer.cl (renamed from src/core/CL/cl_kernels/activation_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/activation_layer_quant.cl (renamed from src/core/CL/cl_kernels/activation_layer_quant.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/arg_min_max.cl (renamed from src/core/CL/cl_kernels/arg_min_max.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/batchnormalization_layer.cl | 183 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/bitwise_op.cl (renamed from src/core/CL/cl_kernels/bitwise_op.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/bounding_box_transform.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform_quantized.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/cast.cl (renamed from src/core/CL/cl_kernels/cast.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/col2im.cl (renamed from src/core/CL/cl_kernels/col2im.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/comparisons.cl (renamed from src/core/CL/cl_kernels/comparisons.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/concatenate.cl (renamed from src/core/CL/cl_kernels/concatenate.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/convert_fc_weights.cl (renamed from src/core/CL/cl_kernels/convert_fc_weights.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/convolution_layer.cl (renamed from src/core/CL/cl_kernels/convolution_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/copy_tensor.cl (renamed from src/core/CL/cl_kernels/copy_tensor.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/crop_tensor.cl (renamed from src/core/CL/cl_kernels/crop_tensor.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/deconvolution_layer.cl (renamed from src/core/CL/cl_kernels/deconvolution_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/dequantization_layer.cl | 90 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/elementwise_operation.cl (renamed from src/core/CL/cl_kernels/elementwise_operation.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl (renamed from src/core/CL/cl_kernels/elementwise_operation_quantized.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/elementwise_unary.cl (renamed from src/core/CL/cl_kernels/elementwise_unary.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/fft.cl (renamed from src/core/CL/cl_kernels/fft.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/fft_digit_reverse.cl (renamed from src/core/CL/cl_kernels/fft_digit_reverse.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/fft_scale.cl (renamed from src/core/CL/cl_kernels/fft_scale.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/fill_border.cl (renamed from src/core/CL/cl_kernels/fill_border.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/floor.cl (renamed from src/core/CL/cl_kernels/floor.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/gather.cl (renamed from src/core/CL/cl_kernels/gather.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/gemm.cl (renamed from src/core/CL/cl_kernels/gemm.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/gemm_v1.cl (renamed from src/core/CL/cl_kernels/gemm_v1.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/gemmlowp.cl (renamed from src/core/CL/cl_kernels/gemmlowp.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/gemv.cl (renamed from src/core/CL/cl_kernels/gemv.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/generate_proposals.cl (renamed from src/core/CL/cl_kernels/generate_proposals.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/generate_proposals_quantized.cl (renamed from src/core/CL/cl_kernels/generate_proposals_quantized.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/instance_normalization.cl (renamed from src/core/CL/cl_kernels/instance_normalization.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/l2_normalize.cl (renamed from src/core/CL/cl_kernels/l2_normalize.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/mean_stddev_normalization.cl (renamed from src/core/CL/cl_kernels/mean_stddev_normalization.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/memset.cl (renamed from src/core/CL/cl_kernels/memset.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/minmax_layer.cl (renamed from src/core/CL/cl_kernels/minmax_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/nonmax.cl (renamed from src/core/CL/cl_kernels/nonmax.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/pad_layer.cl (renamed from src/core/CL/cl_kernels/pad_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/permute.cl (renamed from src/core/CL/cl_kernels/permute.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/pixelwise_mul_float.cl (renamed from src/core/CL/cl_kernels/pixelwise_mul_float.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/pixelwise_mul_int.cl (renamed from src/core/CL/cl_kernels/pixelwise_mul_int.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/pooling_layer.cl | 390 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl (renamed from src/core/CL/cl_kernels/qlstm_layer_normalization.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/quantization_layer.cl (renamed from src/core/CL/cl_kernels/quantization_layer.cl) | 6 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/range.cl (renamed from src/core/CL/cl_kernels/range.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/reduction_operation.cl (renamed from src/core/CL/cl_kernels/reduction_operation.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/reshape_layer.cl (renamed from src/core/CL/cl_kernels/reshape_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/reverse.cl (renamed from src/core/CL/cl_kernels/reverse.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/roi_align_layer.cl (renamed from src/core/CL/cl_kernels/roi_align_layer.cl) | 6 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl (renamed from src/core/CL/cl_kernels/roi_align_layer_quantized.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/roi_pooling_layer.cl (renamed from src/core/CL/cl_kernels/roi_pooling_layer.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/select.cl (renamed from src/core/CL/cl_kernels/select.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/slice_ops.cl (renamed from src/core/CL/cl_kernels/slice_ops.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/softmax_layer.cl (renamed from src/core/CL/cl_kernels/softmax_layer.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/softmax_layer_quantized.cl (renamed from src/core/CL/cl_kernels/softmax_layer_quantized.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/stack_layer.cl (renamed from src/core/CL/cl_kernels/stack_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/tile.cl (renamed from src/core/CL/cl_kernels/tile.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/transpose.cl (renamed from src/core/CL/cl_kernels/transpose.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/common/unpooling_layer.cl (renamed from src/core/CL/cl_kernels/unpooling_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/dequantization_layer.cl | 212 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/batch_to_space.cl (renamed from src/core/CL/cl_kernels/depth_to_space.cl) | 74 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl | 147 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/channel_shuffle.cl | 103 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/depth_to_space.cl | 69 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/dequantization_layer.cl | 86 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl (renamed from src/core/CL/cl_kernels/direct_convolution1x1.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl (renamed from src/core/CL/cl_kernels/direct_convolution3x3.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl (renamed from src/core/CL/cl_kernels/direct_convolution5x5.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl (renamed from src/core/CL/cl_kernels/direct_convolution_quantized.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/im2col.cl (renamed from src/core/CL/cl_kernels/im2col.cl) | 501 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/normalization_layer.cl | 175 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl | 82 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl | 101 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/pooling_layer.cl | 331 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl | 142 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/prior_box_layer.cl (renamed from src/core/CL/cl_kernels/prior_box_layer.cl) | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/remap.cl | 133 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/reorg_layer.cl (renamed from src/core/CL/cl_kernels/reorg_layer.cl) | 43 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/scale.cl | 148 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/scale_quantized.cl | 86 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/space_to_batch.cl (renamed from src/core/CL/cl_kernels/space_to_batch.cl) | 126 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/space_to_depth.cl | 69 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/upsample_layer.cl | 79 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl | 911 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/winograd_input_transform.cl (renamed from src/core/CL/cl_kernels/winograd_input_transform.cl) | 887 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nchw/winograd_output_transform.cl (renamed from src/core/CL/cl_kernels/winograd_output_transform.cl) | 981 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/batch_to_space.cl (renamed from src/core/CL/cl_kernels/space_to_depth.cl) | 82 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl | 146 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/channel_shuffle.cl (renamed from src/core/CL/cl_kernels/channel_shuffle.cl) | 64 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/depth_to_space.cl | 69 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/dequantization_layer.cl | 87 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/direct_convolution.cl (renamed from src/core/CL/cl_kernels/direct_convolution.cl) | 0 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl (renamed from src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl) | 44 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl (renamed from src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl) | 76 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/im2col.cl | 532 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/normalization_layer.cl (renamed from src/core/CL/cl_kernels/normalization_layer.cl) | 154 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl (renamed from src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl) | 55 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl (renamed from src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl) | 72 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/pooling_layer.cl | 364 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl (renamed from src/core/CL/cl_kernels/pooling_layer_quantized.cl) | 104 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/remap.cl (renamed from src/core/CL/cl_kernels/remap.cl) | 112 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/reorg_layer.cl | 76 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/scale.cl (renamed from src/core/CL/cl_kernels/scale.cl) | 141 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/scale_quantized.cl (renamed from src/core/CL/cl_kernels/scale_quantized.cl) | 81 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/space_to_batch.cl | 155 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/space_to_depth.cl | 69 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/upsample_layer.cl (renamed from src/core/CL/cl_kernels/upsample_layer.cl) | 59 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl (renamed from src/core/CL/cl_kernels/winograd_filter_transform.cl) | 881 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl | 953 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl | 1030 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/pooling_layer.cl | 981 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/sobel_filter.cl | 541 | ||||
-rw-r--r-- | src/core/gpu/cl/ClKernelLibrary.cpp | 1052 |
116 files changed, 7617 insertions, 7240 deletions
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl deleted file mode 100644 index 8a71985b02..0000000000 --- a/src/core/CL/cl_kernels/batch_to_space.cl +++ /dev/null @@ -1,232 +0,0 @@ -/* - * Copyright (c) 2018-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(DATA_TYPE) && defined(BATCH_SIZE) -/** Batch to space transformation. (NCHW) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_nchw( - TENSOR3D_DECLARATION(input), - const int batch_id, - VECTOR_DECLARATION(block_shape), - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - - const int block_x = *((__global int *)vector_offset(&block, 0)); - const int block_y = *((__global int *)vector_offset(&block, 1)); - - const int r = (BATCH_SIZE / (block_x * block_y)); - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); -} -/** Batch to space transformation. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. 
-DBATCH_SIZE=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32 - * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_nhwc( - TENSOR3D_DECLARATION(input), - const int batch_id, - VECTOR_DECLARATION(block_shape), - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - - const int block_x = *((__global int *)vector_offset(&block, 0)); - const int block_y = *((__global int *)vector_offset(&block, 1)); - - const int r = (BATCH_SIZE / (block_x * block_y)); - const int x = get_global_id(1); - const int y = get_global_id(2); - const int z = get_global_id(0); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) - -#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) -/** Batch to space transformation. (NCHW) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_static_nchw( - TENSOR3D_DECLARATION(input), - const int batch_id, - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - const int block_x = BLOCK_SHAPE_X; - const int block_y = BLOCK_SHAPE_Y; - - const int r = (BATCH_SIZE / (block_x * block_y)); - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); -} -/** Batch to space transformation. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. 
-DBLOCK_SHAPE_Y=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[in] batch_id The input tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void batch_to_space_static_nhwc( - TENSOR3D_DECLARATION(input), - const int batch_id, - TENSOR4D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - - const int block_x = BLOCK_SHAPE_X; - const int block_y = BLOCK_SHAPE_Y; - - const int r = (BATCH_SIZE / (block_x 
* block_y)); - const int x = get_global_id(1); - const int y = get_global_id(2); - const int z = get_global_id(0); - const int w = batch_id % r; - - const int out_x = x * block_x + (batch_id / r) % block_x; - const int out_y = y * block_y + (batch_id / r) / block_x; - - *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); -} -#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl deleted file mode 100644 index 89cbe4440e..0000000000 --- a/src/core/CL/cl_kernels/batchnormalization_layer.cl +++ /dev/null @@ -1,418 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#define ADD_OP(a, b) ((a) + (b)) -#define SUB_OP(a, b) ((a) - (b)) -#define MUL_OP(a, b) ((a) * (b)) -#define INVSQRT_OP(a) rsqrt((a)) -#define SQCVT_SAT(a) (a) - -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) -#include "activation_float_helpers.h" - -/** Apply batch normalization. - * - * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. 
-DACTIVATION_TYPE=relu - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p input_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr - * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) - * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor - * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr - * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) - * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor - * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr - * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) - * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor - * @param[in] epsilon Epsilon parameter in the batch normalization equation - */ -__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input), -#ifndef IN_PLACE - TENSOR3D_DECLARATION(output), -#endif /* not IN_PLACE */ - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(var), -#ifndef USE_DEFAULT_BETA - VECTOR_DECLARATION(beta), -#endif /* USE_DEFAULT_BETA */ -#ifndef USE_DEFAULT_GAMMA - VECTOR_DECLARATION(gamma), -#endif /* USE_DEFAULT_GAMMA */ - float epsilon) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); -#ifdef IN_PLACE - Tensor3D out = in; -#else /* IN_PLACE */ - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); -#endif /* IN_PLACE */ - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector var = CONVERT_TO_VECTOR_STRUCT(var); -#ifndef USE_DEFAULT_BETA - Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); -#endif /* USE_DEFAULT_BETA */ -#ifndef USE_DEFAULT_GAMMA - Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); -#endif /* USE_DEFAULT_GAMMA */ - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - denominator = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - numerator = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - x_bar = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res = 0; - - const int current_slice = get_global_id(2); - - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); - denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x)); - denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); - - // Calculate x bar and store results - numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * 
mean.stride_x)); - numerator = SUB_OP(data, numerator); - x_bar = MUL_OP(numerator, denominator); - -#ifndef USE_DEFAULT_GAMMA - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x)); - - res = MUL_OP(gamma_vec, x_bar); -#else /* USE_DEFAULT_GAMMA */ - // gamma is equal to 1, no need to perform multiplications - res = x_bar; -#endif /* USE_DEFAULT_GAMMA */ - -#ifndef USE_DEFAULT_BETA - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x)); - // beta is not zero, hence we need to perform the addition - res = ADD_OP(res, beta_vec); -#endif /* USE_DEFAULT_BETA */ - - res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL); - - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE *)out.ptr); -} - -/** Apply batch normalization on tensors with NHWC format. - * - * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu - * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr - * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) - * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor - * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr - * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) - * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor - * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr - * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) - * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor - * @param[in] epsilon Epsilon parameter in the batch normalization equation - */ -__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input), -#ifndef IN_PLACE - TENSOR3D_DECLARATION(output), -#endif /* not IN_PLACE */ - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(var), -#ifndef USE_DEFAULT_BETA - VECTOR_DECLARATION(beta), -#endif /* USE_DEFAULT_BETA */ -#ifndef USE_DEFAULT_GAMMA - VECTOR_DECLARATION(gamma), -#endif /* USE_DEFAULT_GAMMA */ - float epsilon) -{ - uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); - - __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; -#ifdef IN_PLACE - __global uchar *output_addr = input_ptr; -#else /* IN_PLACE */ - __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; -#endif /* IN_PLACE */ - __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; - __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs; -#ifndef USE_DEFAULT_BETA - __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs; -#endif /* USE_DEFAULT_BETA */ -#ifndef USE_DEFAULT_GAMMA - __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs; -#endif /* USE_DEFAULT_GAMMA */ - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - data = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - denominator = 0; - 
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - numerator = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - x_bar = 0; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res0 = 0; - - data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); - denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr); - denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); - - // Calculate x bar and store results - numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr); - numerator = SUB_OP(data, numerator); - x_bar = MUL_OP(numerator, denominator); - -#ifndef USE_DEFAULT_GAMMA - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr); - - res0 = MUL_OP(gamma_vec, x_bar); -#else /* USE_DEFAULT_GAMMA */ - // gamma is equal to 1, no need to perform multiplications - res0 = x_bar; -#endif /* USE_DEFAULT_GAMMA */ - -#ifndef USE_DEFAULT_BETA - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr); - // beta is not zero, hence we need to perform the addition - res0 = ADD_OP(res0, beta_vec); -#endif /* USE_DEFAULT_BETA */ - - res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL); - - STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) -} -#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/ - -#if defined(DATA_TYPE) && defined(EPSILON) -/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC - * - * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension - * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float - * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16. 
- * For depthwise convolution weight do not pass DIM2 - * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter - * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f - * - * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32 - * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes) - * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes) - * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes) - * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor - * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr - * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) - * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) - * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) - * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor - * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p w_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr - * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) - * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor - * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr - * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes) - * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes) - * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes) - * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor - * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. 
Supported data types: same as @p w_ptr - * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes) - * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor - * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr - * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes) - * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor - * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr - * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes) - * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor - */ -__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w), -#if defined(BIAS) - VECTOR_DECLARATION(b), -#endif // defined(BIAS) - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(var) -#ifndef IN_PLACE_W - , - TENSOR3D_DECLARATION(w_fused) -#endif // ifndef IN_PLACE_W -#ifndef IN_PLACE_B - , - VECTOR_DECLARATION(b_fused) -#endif // ifndef IN_PLACE_B -#if defined(BETA) - , - VECTOR_DECLARATION(beta) -#endif // defined(BETA) -#if defined(GAMMA) - , - VECTOR_DECLARATION(gamma) -#endif // defined(GAMMA) - ) -{ - int x = get_global_id(0); - int y = get_global_id(1); - int z = get_global_id(2); - -#if defined(DIM2) - int c0 = z % DIM2; - int c1 = z / DIM2; -#else // ! 
defined(DIM2) - int c0 = 0; -#if defined(NHWC) - int c1 = x; -#else // defined(NHWC) - int c1 = z; -#endif // defined(NHWC) -#endif // defined(DIM2) - - int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z; - int v_offset = c1 * sizeof(DATA_TYPE); - - DATA_TYPE w_old = 0.0f; - DATA_TYPE b_old = 0.0f; - DATA_TYPE w_new = 0.0f; - DATA_TYPE b_new = 0.0f; - DATA_TYPE gamma = 1.0f; - DATA_TYPE mean = 0.0f; - DATA_TYPE var = 1.0f; - DATA_TYPE beta = 0.0f; - - w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)); - var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes)); - mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes)); - -#if defined(GAMMA) - gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes)); -#endif // defined(GAMMA) - - // Compute new weight - w_new = (gamma * w_old) / (sqrt(var + EPSILON)); - -#if defined(IN_PLACE_W) - *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new; -#else // defined(IN_PLACE_W) - *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new; -#endif // defined(IN_PLACE_W) - - // Compute bias -#if !defined(DIM2) && defined(NHWC) - if(z == 0 && y == 0) -#else // !defined(DIM2) && defined(NHWC) - if(x == 0 && y == 0 && c0 == 0) -#endif // !defined(DIM2) && defined(NHWC) - { -#if defined(BIAS) - b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)); -#endif // defined(BIAS) -#if defined(BETA) - beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes)); -#endif // defined(BETA) - - b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta; - -#if defined(BIAS) - -#if defined(IN_PLACE_B) - *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new; -#else // defined(IN_PLACE_B) - *((__global DATA_TYPE *)(b_fused_ptr 
+ v_offset + b_fused_offset_first_element_in_bytes)) = b_new; -#endif // defined(IN_PLACE_B) - -#else // defined(BIAS) - -#ifndef IN_PLACE_B - *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new; -#endif // ifndef IN_PLACE_B - -#endif // defined(BIAS) - } -} -#endif // defined(DATA_TYPE) && defined(EPSILON)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl index bc2c99b6c8..a04556a1ed 100644 --- a/src/core/CL/cl_kernels/activation_layer.cl +++ b/src/core/CL/cl_kernels/common/activation_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl index 66261019ab..38ee00b17a 100644 --- a/src/core/CL/cl_kernels/activation_layer_quant.cl +++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl index 6e57ed0af1..6e57ed0af1 100644 --- a/src/core/CL/cl_kernels/arg_min_max.cl +++ b/src/core/CL/cl_kernels/common/arg_min_max.cl diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl new file mode 100644 index 0000000000..18f54907df --- /dev/null +++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl @@ -0,0 +1,183 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(EPSILON) +/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC + * + * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension + * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float + * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16. + * For depthwise convolution weight do not pass DIM2 + * @note Data layout NHWC should be passed at compile time with -DNHWC. 
For data layout NCHW it is not required to pass any parameter + * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f + * + * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32 + * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes) + * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes) + * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes) + * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor + * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr + * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes) + * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes) + * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes) + * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p w_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr + * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes) + * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes) + * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes) + * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor + * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. 
Supported data types: same as @p w_ptr + * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes) + * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor + * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr + * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor + * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr + * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor + */ +__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w), +#if defined(BIAS) + VECTOR_DECLARATION(b), +#endif // defined(BIAS) + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(var) +#ifndef IN_PLACE_W + , + TENSOR3D_DECLARATION(w_fused) +#endif // ifndef IN_PLACE_W +#ifndef IN_PLACE_B + , + VECTOR_DECLARATION(b_fused) +#endif // ifndef IN_PLACE_B +#if defined(BETA) + , + VECTOR_DECLARATION(beta) +#endif // defined(BETA) +#if defined(GAMMA) + , + VECTOR_DECLARATION(gamma) +#endif // defined(GAMMA) + ) +{ + int x = get_global_id(0); + int y = get_global_id(1); + int z = get_global_id(2); + +#if defined(DIM2) + int c0 = z % DIM2; + int c1 = z / DIM2; +#else // ! 
defined(DIM2) + int c0 = 0; +#if defined(NHWC) + int c1 = x; +#else // defined(NHWC) + int c1 = z; +#endif // defined(NHWC) +#endif // defined(DIM2) + + int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z; + int v_offset = c1 * sizeof(DATA_TYPE); + + DATA_TYPE w_old = 0.0f; + DATA_TYPE b_old = 0.0f; + DATA_TYPE w_new = 0.0f; + DATA_TYPE b_new = 0.0f; + DATA_TYPE gamma = 1.0f; + DATA_TYPE mean = 0.0f; + DATA_TYPE var = 1.0f; + DATA_TYPE beta = 0.0f; + + w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)); + var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes)); + mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes)); + +#if defined(GAMMA) + gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes)); +#endif // defined(GAMMA) + + // Compute new weight + w_new = (gamma * w_old) / (sqrt(var + EPSILON)); + +#if defined(IN_PLACE_W) + *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new; +#else // defined(IN_PLACE_W) + *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new; +#endif // defined(IN_PLACE_W) + + // Compute bias +#if !defined(DIM2) && defined(NHWC) + if(z == 0 && y == 0) +#else // !defined(DIM2) && defined(NHWC) + if(x == 0 && y == 0 && c0 == 0) +#endif // !defined(DIM2) && defined(NHWC) + { +#if defined(BIAS) + b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)); +#endif // defined(BIAS) +#if defined(BETA) + beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes)); +#endif // defined(BETA) + + b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta; + +#if defined(BIAS) + +#if defined(IN_PLACE_B) + *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new; +#else // defined(IN_PLACE_B) + *((__global DATA_TYPE *)(b_fused_ptr 
+ v_offset + b_fused_offset_first_element_in_bytes)) = b_new; +#endif // defined(IN_PLACE_B) + +#else // defined(BIAS) + +#ifndef IN_PLACE_B + *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new; +#endif // ifndef IN_PLACE_B + +#endif // defined(BIAS) + } +} +#endif // defined(DATA_TYPE) && defined(EPSILON)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl index a600bced9e..e142c1d275 100644 --- a/src/core/CL/cl_kernels/bitwise_op.cl +++ b/src/core/CL/cl_kernels/common/bitwise_op.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl index f2e9cb0ed0..f2e9cb0ed0 100644 --- a/src/core/CL/cl_kernels/bounding_box_transform.cl +++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl index c1d45a56b9..c1d45a56b9 100644 --- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl +++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/common/cast.cl index 036a683ec7..036a683ec7 100644 --- a/src/core/CL/cl_kernels/cast.cl +++ b/src/core/CL/cl_kernels/common/cast.cl diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl index 59c2d8a3aa..89054dcb31 100644 --- a/src/core/CL/cl_kernels/col2im.cl +++ b/src/core/CL/cl_kernels/common/col2im.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl index 408846144d..f05cb87835 100644 --- a/src/core/CL/cl_kernels/comparisons.cl +++ b/src/core/CL/cl_kernels/common/comparisons.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl index d2e65408dc..394b20c739 100644 --- a/src/core/CL/cl_kernels/concatenate.cl +++ b/src/core/CL/cl_kernels/common/concatenate.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl index a451c0213b..01ef04a7d6 100644 --- a/src/core/CL/cl_kernels/convert_fc_weights.cl +++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl index cfd1f12328..be76929ac8 100644 --- a/src/core/CL/cl_kernels/convolution_layer.cl +++ b/src/core/CL/cl_kernels/common/convolution_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2019 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl index 9c90969827..753b98d1b0 100644 --- a/src/core/CL/cl_kernels/copy_tensor.cl +++ b/src/core/CL/cl_kernels/common/copy_tensor.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl index d9090dc838..d9090dc838 100644 --- a/src/core/CL/cl_kernels/crop_tensor.cl +++ b/src/core/CL/cl_kernels/common/crop_tensor.cl diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl index b1d5e61476..4ac5e3f0e9 100644 --- a/src/core/CL/cl_kernels/deconvolution_layer.cl +++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl new file mode 100644 index 0000000000..7fa62577ce --- /dev/null +++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) + +/** This performs the dequantization of 8-bit unsigned integers to floating point. + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Quantization scale of input tensor is passed in with -DSCALE=scale. + * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void dequantization_layer( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale and offset vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = SCALE; + + const VEC_DATA_TYPE(int, VEC_SIZE) + voffset = OFFSET; + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = 
(DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl index 45dcbfc6e2..45dcbfc6e2 100644 --- a/src/core/CL/cl_kernels/elementwise_operation.cl +++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl index a11be80875..a11be80875 100644 --- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl +++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl index d2d9d97d33..d2d9d97d33 100644 --- a/src/core/CL/cl_kernels/elementwise_unary.cl +++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/common/fft.cl index 51763a620a..3f26d0f1a6 100644 --- a/src/core/CL/cl_kernels/fft.cl +++ b/src/core/CL/cl_kernels/common/fft.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl index de566212c6..5f64d95bf9 100644 --- a/src/core/CL/cl_kernels/fft_digit_reverse.cl +++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl index 57e25ef504..c799dd3b9e 100644 --- a/src/core/CL/cl_kernels/fft_scale.cl +++ b/src/core/CL/cl_kernels/common/fft_scale.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl index 5775d899e8..a43343c9f4 100644 --- a/src/core/CL/cl_kernels/fill_border.cl +++ b/src/core/CL/cl_kernels/common/fill_border.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/common/floor.cl index f6dd4edd2e..f6dd4edd2e 100644 --- a/src/core/CL/cl_kernels/floor.cl +++ b/src/core/CL/cl_kernels/common/floor.cl diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/common/gather.cl index 41f439cb47..76eaefa92e 100644 --- a/src/core/CL/cl_kernels/gather.cl +++ b/src/core/CL/cl_kernels/common/gather.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl index 10435d376f..10435d376f 100644 --- a/src/core/CL/cl_kernels/gemm.cl +++ b/src/core/CL/cl_kernels/common/gemm.cl diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/common/gemm_v1.cl index a136a1b96b..a136a1b96b 100644 --- a/src/core/CL/cl_kernels/gemm_v1.cl +++ b/src/core/CL/cl_kernels/common/gemm_v1.cl diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl index 5cafb5389c..5cafb5389c 100644 --- a/src/core/CL/cl_kernels/gemmlowp.cl +++ b/src/core/CL/cl_kernels/common/gemmlowp.cl diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl index aaa83975f8..71a372eb29 100644 --- a/src/core/CL/cl_kernels/gemv.cl +++ b/src/core/CL/cl_kernels/common/gemv.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl index e8306c55a8..5b8502072a 100644 --- a/src/core/CL/cl_kernels/generate_proposals.cl +++ b/src/core/CL/cl_kernels/common/generate_proposals.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl index 04264197f4..70f861c4b7 100644 --- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl +++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl index adfbebd67d..adfbebd67d 100644 --- a/src/core/CL/cl_kernels/instance_normalization.cl +++ b/src/core/CL/cl_kernels/common/instance_normalization.cl diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl index fbe3406239..fbe3406239 100644 --- a/src/core/CL/cl_kernels/l2_normalize.cl +++ b/src/core/CL/cl_kernels/common/l2_normalize.cl diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl index 76be629934..05727a6aa6 100644 --- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl +++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019, 2021 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/common/memset.cl index bb46a49f84..9ff25f3af4 100644 --- a/src/core/CL/cl_kernels/memset.cl +++ b/src/core/CL/cl_kernels/common/memset.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl index 655696f9a1..49356451df 100644 --- a/src/core/CL/cl_kernels/minmax_layer.cl +++ b/src/core/CL/cl_kernels/common/minmax_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/common/nonmax.cl index ab13131807..702e635a89 100644 --- a/src/core/CL/cl_kernels/nonmax.cl +++ b/src/core/CL/cl_kernels/common/nonmax.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl index 903e924a2f..5ae4ec884d 100644 --- a/src/core/CL/cl_kernels/pad_layer.cl +++ b/src/core/CL/cl_kernels/common/pad_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/common/permute.cl index db9e7ecc25..a03eeb1a19 100644 --- a/src/core/CL/cl_kernels/permute.cl +++ b/src/core/CL/cl_kernels/common/permute.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl index 10875293a9..10875293a9 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl +++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl index 6d1c2d0c79..6d1c2d0c79 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl diff --git a/src/core/CL/cl_kernels/common/pooling_layer.cl b/src/core/CL/cl_kernels/common/pooling_layer.cl new file mode 100644 index 0000000000..5122f2c251 --- /dev/null +++ b/src/core/CL/cl_kernels/common/pooling_layer.cl @@ -0,0 +1,390 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if STRIDE_X == 1 +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) +#elif STRIDE_X == 2 /* STRIDE_X == 1 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output) +#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */ +#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output) +#endif /* STRIDE_X == 3 */ + +#if defined(FP_MIXED_PRECISION) +#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ + CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) +#else /* defined(FP_MIXED_PRECISION) */ +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) +#endif /* defined(FP_MIXED_PRECISION) */ + +#define POOLING3x3_STRIDE1(res, input, output) \ + ({ \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ + data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ + data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ 
+ data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ + data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ + data00 = POW2_OP(data00, 4); \ + data01 = POW2_OP(data01, 2); \ + data10 = POW2_OP(data10, 4); \ + data11 = POW2_OP(data11, 2); \ + data20 = POW2_OP(data20, 4); \ + data21 = POW2_OP(data21, 2); \ + \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE2(res, input, output) \ + ({ \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE 
*)tensor3D_offset(&input, 0, 1, 0)); \ + ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \ + data00 = POW2_OP(data00, 8); \ + data01 = POW2_OP(data01, 1); \ + data10 = POW2_OP(data10, 8); \ + data11 = POW2_OP(data11, 1); \ + data20 = POW2_OP(data20, 8); \ + data21 = POW2_OP(data21, 1); \ + \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \ + \ + values00 = POOL_OP(values00, values10); \ + values01 = POOL_OP(values01, values11); \ + values00 = POOL_OP(values00, values20); \ + values01 = POOL_OP(values01, values21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ + }) + +#define POOLING3x3_STRIDE3(res, input, output) \ + ({ \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 
0, 0) + 8); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ + data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ + data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ + data00 = POW2_OP(data00, 8); \ + data01 = POW2_OP(data01, 4); \ + data10 = POW2_OP(data10, 8); \ + data11 = POW2_OP(data11, 4); \ + data20 = POW2_OP(data20, 8); \ + data21 = POW2_OP(data21, 4); \ + \ + data00 = POOL_OP(data00, data10); \ + data01 = POOL_OP(data01, data11); \ + data00 = POOL_OP(data00, data20); \ + data01 = POOL_OP(data01, data21); \ + \ + res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \ + res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \ + }) + +ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +/** Performs a pooling function of pool size equal to 2. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. 
Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_2( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + // Load data + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) + data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) + data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 = POW2_OP(data0, 2); + data1 = POW2_OP(data1, 2); +#endif /* defined(POOL_L2) */ + + // Perform calculations + data0 = POOL_OP(data0, data1); + ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1); + +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average or l2 pooling + res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + *(__global DATA_TYPE *)output.ptr = 
(DATA_TYPE)res; +} + +/** Performs a pooling function of pool size equal to 3 + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_3( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + // Load data + VEC_DATA_TYPE(ACC_DATA_TYPE, 3) + data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); + VEC_DATA_TYPE(ACC_DATA_TYPE, 3) + data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); + VEC_DATA_TYPE(ACC_DATA_TYPE, 3) + data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 = POW2_OP(data0, 3); + data1 = POW2_OP(data1, 3); + data2 = POW2_OP(data2, 3); +#endif /* defined(POOL_L2) */ + + // Perform calculations + data0 = POOL_OP(data0, data1); + data0 = POOL_OP(data0, data2); + ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2); + +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average pooling + res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); +#endif /* defined(POOL_AVG) 
|| defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; +} + +#if defined(POOLING3x3) + +#define CONVERT_OP(data_type) convert_##data_type##4 +#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type) + +VEC_DATA_TYPE(ACC_DATA_TYPE, 4) +calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w); + const int end_y = min(start_y + pool_size, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max((int4)0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x)); +} + +/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_optimized_3( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + res; + + // Perform pooling 3x3 for 4 output elements + POOLING3x3(res, input, output); + +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average pooling + res *= 
calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr); +} +#endif // defined(POOLING3x3) diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl index 24cb111772..4494dd8cec 100644 --- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl +++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl index 3538dae5f0..69cc288c25 100644 --- a/src/core/CL/cl_kernels/quantization_layer.cl +++ b/src/core/CL/cl_kernels/common/quantization_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -80,8 +80,8 @@ __kernel void quantization_layer( // Create scale and offset vectors const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE; - const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET; -#else // defined(IS_FLOAT) + const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET; +#else // defined(IS_FLOAT) // Load data VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr); diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/common/range.cl index d25d10e207..d25d10e207 100644 --- a/src/core/CL/cl_kernels/range.cl +++ b/src/core/CL/cl_kernels/common/range.cl diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl index 9f2c6e23b5..9f2c6e23b5 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/common/reduction_operation.cl diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl index 2d6a7edade..bfdefc863e 100644 --- a/src/core/CL/cl_kernels/reshape_layer.cl +++ b/src/core/CL/cl_kernels/common/reshape_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl index 10ffe84aeb..6b0afb9c2c 100644 --- a/src/core/CL/cl_kernels/reverse.cl +++ b/src/core/CL/cl_kernels/common/reverse.cl @@ -1,5 +1,5 @@ /* -* Copyright (c) 2018-2020 Arm Limited. +* Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl index e0b98e68c9..8cfe5ddcb6 100644 --- a/src/core/CL/cl_kernels/roi_align_layer.cl +++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * @@ -173,7 +173,7 @@ __kernel void roi_align_layer( const float2 roi_bin_grid = SAMPLING_RATIO; #else // !defined(SAMPLING_RATIO) // Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2. - const float2 roi_bin_grid = ceil(bin_size - EPS_GRID); + const float2 roi_bin_grid = ceil(bin_size - EPS_GRID); #endif // defined(SAMPLING_RATIO) // Move input and output pointer across the fourth dimension @@ -184,7 +184,7 @@ __kernel void roi_align_layer( #if defined(NHWC) __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py); #else // !defined(NHWC) - __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz); + __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz); #endif // defined(NHWC) *_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input, region_start.x, diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl index d5c9a0d9bf..e75dee06f6 100644 --- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl +++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2019-2021 Arm Limited. 
* * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl index 6899b952e0..6899b952e0 100644 --- a/src/core/CL/cl_kernels/roi_pooling_layer.cl +++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/common/select.cl index 6fd4bd4ce3..6fd4bd4ce3 100644 --- a/src/core/CL/cl_kernels/select.cl +++ b/src/core/CL/cl_kernels/common/select.cl diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl index dc3ffd91c1..d12c60f5ea 100644 --- a/src/core/CL/cl_kernels/slice_ops.cl +++ b/src/core/CL/cl_kernels/common/slice_ops.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl index 4d2d89dd73..4d2d89dd73 100644 --- a/src/core/CL/cl_kernels/softmax_layer.cl +++ b/src/core/CL/cl_kernels/common/softmax_layer.cl diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl index 4d5006d804..4d5006d804 100644 --- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl +++ b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl index 438e858df2..2468bf750d 100644 --- a/src/core/CL/cl_kernels/stack_layer.cl +++ b/src/core/CL/cl_kernels/common/stack_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/common/tile.cl index 79da7fe6b9..4332411688 100644 --- a/src/core/CL/cl_kernels/tile.cl +++ b/src/core/CL/cl_kernels/common/tile.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. 
+ * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl index 82db2908b5..82db2908b5 100644 --- a/src/core/CL/cl_kernels/transpose.cl +++ b/src/core/CL/cl_kernels/common/transpose.cl diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl index 457e9bf8f1..6662dc9360 100644 --- a/src/core/CL/cl_kernels/unpooling_layer.cl +++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2020 Arm Limited. + * Copyright (c) 2020-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl deleted file mode 100644 index 127f67d940..0000000000 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ /dev/null @@ -1,212 +0,0 @@ -/* - * Copyright (c) 2017-2020 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) - -/** This performs the dequantization of 8-bit unsigned integers to floating point. - * - * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char - * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Quantization scale of input tensor is passed in with -DSCALE=scale. - * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void dequantization_layer( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; - - // Load data - VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); - - // Create scale and offset vectors - const VEC_DATA_TYPE(float, VEC_SIZE) - vscale = SCALE; - - const VEC_DATA_TYPE(int, VEC_SIZE) - voffset = OFFSET; - - // Dequantize - VEC_DATA_TYPE(float, VEC_SIZE) - res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE)); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); -#else // !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE_DST *)(output.ptr)) = 
(DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); -#endif // defined(LAST_ACCESSED_X) -} -#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) - -#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) -/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW) - * - * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char - * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] scale Pointer to buffer with the per channel quantized scales - */ -__kernel void dequantization_layer_per_channel_nchw( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - __global float *scale) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; - - // Load data - VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); - - // Create scale vectors - const VEC_DATA_TYPE(float, VEC_SIZE) - vscale = scale[get_global_id(2)]; - - // Dequantize - VEC_DATA_TYPE(float, VEC_SIZE) - res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); -#else // !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE_DST 
*)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]); -#endif // defined(LAST_ACCESSED_X) -} -/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC) - * - * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char - * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: F16/F32 - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] scale Pointer to buffer with the per channel quantized scales - */ -__kernel void dequantization_layer_per_channel_nhwc( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - __global float *scale) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - -#if defined(LAST_ACCESSED_X) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi = (int)(get_global_id(0) * VEC_SIZE); - input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; - output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; - scale -= max(xi - (int)LAST_ACCESSED_X, 0); - - // Load data - VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); - - // Create scale vectors - const VEC_DATA_TYPE(float, VEC_SIZE) - vscale = VLOAD(VEC_SIZE)(0, &scale[xi]); - - // Dequantize - VEC_DATA_TYPE(float, VEC_SIZE) - res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); - - // Store result - VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); -#else // 
!defined(LAST_ACCESSED_X) - *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]); -#endif // defined(LAST_ACCESSED_X) -} -#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl index f301e64d66..89129cff3f 100644 --- a/src/core/CL/cl_kernels/depth_to_space.cl +++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2021 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,14 +23,14 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) -/** Depth to space transformation. (NCHW) +#if defined(DATA_TYPE) && defined(BATCH_SIZE) +/** Batch to space transformation. (NCHW) * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -39,6 +39,12 @@ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor * @param[in] batch_id The input tensor batch id + * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) @@ -48,31 +54,41 @@ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void depth_to_space_nchw( +__kernel void batch_to_space_nchw( TENSOR3D_DECLARATION(input), const int batch_id, + VECTOR_DECLARATION(block_shape), TENSOR4D_DECLARATION(output)) { - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); + + const int block_x = *((__global int *)vector_offset(&block, 0)); + const int block_y = *((__global int *)vector_offset(&block, 1)); - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int r = (BATCH_SIZE / (block_x * block_y)); const int x = get_global_id(0); const int y = get_global_id(1); - const int z = get_global_id(2) % r; + const int z = get_global_id(2); + const int w = batch_id % r; - const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE; - const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; - *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr); + *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); } -/** Depth to space transformation. 
(NHWC) +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) + +#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) +/** Batch to space transformation. (NCHW) * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 + * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -90,7 +106,7 @@ __kernel void depth_to_space_nchw( * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void depth_to_space_nhwc( +__kernel void batch_to_space_static_nchw( TENSOR3D_DECLARATION(input), const int batch_id, TENSOR4D_DECLARATION(output)) @@ -98,14 +114,18 @@ __kernel void depth_to_space_nhwc( Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); - const int x = get_global_id(1); - const int y = 
get_global_id(2); - const int z = get_global_id(0) % r; + const int block_x = BLOCK_SHAPE_X; + const int block_y = BLOCK_SHAPE_Y; + + const int r = (BATCH_SIZE / (block_x * block_y)); + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2); + const int w = batch_id % r; - const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; - const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; - *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr); + *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr); } -#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl new file mode 100644 index 0000000000..2d466661b3 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl @@ -0,0 +1,147 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define ADD_OP(a, b) ((a) + (b)) +#define SUB_OP(a, b) ((a) - (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define INVSQRT_OP(a) rsqrt((a)) +#define SQCVT_SAT(a) (a) + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) +#include "activation_float_helpers.h" + +/** Apply batch normalization. + * + * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. 
-DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p input_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + * @param[in] epsilon Epsilon parameter in the batch normalization equation + */ +__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR3D_DECLARATION(output), +#endif /* not IN_PLACE */ + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(var), +#ifndef USE_DEFAULT_BETA + VECTOR_DECLARATION(beta), +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + VECTOR_DECLARATION(gamma), +#endif /* USE_DEFAULT_GAMMA */ + float epsilon) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); +#ifdef IN_PLACE + Tensor3D out = in; +#else /* IN_PLACE */ + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); +#endif /* IN_PLACE */ + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector var = CONVERT_TO_VECTOR_STRUCT(var); +#ifndef USE_DEFAULT_BETA + Vector beta = CONVERT_TO_VECTOR_STRUCT(beta); +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma); +#endif /* USE_DEFAULT_GAMMA */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + denominator = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + numerator = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + x_bar = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res = 0; + + const int current_slice = get_global_id(2); + + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr); + denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x)); + denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); + + // Calculate x bar and store results + numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * 
mean.stride_x)); + numerator = SUB_OP(data, numerator); + x_bar = MUL_OP(numerator, denominator); + +#ifndef USE_DEFAULT_GAMMA + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x)); + + res = MUL_OP(gamma_vec, x_bar); +#else /* USE_DEFAULT_GAMMA */ + // gamma is equal to 1, no need to perform multiplications + res = x_bar; +#endif /* USE_DEFAULT_GAMMA */ + +#ifndef USE_DEFAULT_BETA + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x)); + // beta is not zero, hence we need to perform the addition + res = ADD_OP(res, beta_vec); +#endif /* USE_DEFAULT_BETA */ + + res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL); + + VSTORE(VEC_SIZE) + (res, 0, (__global DATA_TYPE *)out.ptr); +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl new file mode 100644 index 0000000000..57d82e1e6f --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl @@ -0,0 +1,103 @@ +/* +* Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) + +// Check valid VEC_SIZES +#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 +#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported" +#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16 + +#define DIV_MOD_UINT(x, y, div_res, mod_res) \ + ({ \ + div_res = (uint)((x) * (float)(1.0f / (float)(y))); \ + uint r = div_res * (y); \ + mod_res = (x)-r; \ + }) + +/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details. + * + * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4 + * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64 + * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2 + * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1 + * K is equal to num_channels / num_groups. + * + * @param[in] src_ptr Pointer to the source matrix. 
Supported data types: All + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst)) +{ + uint curr_channel = 0; // channel id 
of input + uint batch_id = 0; // batch id + uint group_id = 0; // group id + uint channel_id = 0; // channel id within the group + + // Compute curr_channel and batch_id + DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel); + + // Compute group_id and channel_id + DIV_MOD_UINT(curr_channel, K, group_id, channel_id); + + const uint x = get_global_id(0) * VEC_SIZE; + const uint y = get_global_id(1) * 2; + const uint z = channel_id * NUM_GROUPS + group_id; + + // Load the Nx2 block + const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); + + // Store blocks + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w; + VSTORE(VEC_SIZE) + (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y)); + VSTORE(VEC_SIZE) + (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y)); +} + +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl new file mode 100644 index 0000000000..b9f223fe9d --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Depth to space transformation. (NCHW) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. 
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void depth_to_space_nchw( + TENSOR3D_DECLARATION(input), + const int batch_id, + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2) % r; + + const int out_x = x * BLOCK_SHAPE + (get_global_id(2) 
/ r) % BLOCK_SHAPE; + const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl new file mode 100644 index 0000000000..e0203f7408 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) +/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW) + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. 
-DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale Pointer to buffer with the per channel quantized scales + */ +__kernel void dequantization_layer_per_channel_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + __global float *scale) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + 
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = scale[get_global_id(2)]; + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl index 8ab2d1d4ea..8ab2d1d4ea 100644 --- a/src/core/CL/cl_kernels/direct_convolution1x1.cl +++ b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl index 811df053c4..811df053c4 100644 --- a/src/core/CL/cl_kernels/direct_convolution3x3.cl +++ b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl index 59d668f0bf..59d668f0bf 100644 --- a/src/core/CL/cl_kernels/direct_convolution5x5.cl +++ b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl index b80d4f587e..b80d4f587e 100644 --- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl +++ b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl index a1467a0b36..fddf918c63 100644 --- a/src/core/CL/cl_kernels/im2col.cl +++ b/src/core/CL/cl_kernels/nchw/im2col.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -22,7 +22,6 @@ * SOFTWARE. 
*/ #include "helpers.h" - #if defined(DATA_TYPE) && defined(ELEMENT_SIZE) #if ELEMENT_SIZE == 1 @@ -861,500 +860,4 @@ __kernel void im2col_generic_padx0_pady0_nchw( #endif // HAS_BIAS } #endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE) - -#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) - -#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) -#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE) - -/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension - * @name IM2COL1X9_NHWC_STORE - * - * @note To use this macro for a 3x3 block, @p ROW has to be 0 - * - * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16 - * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size - * @param[in] DATA_TYPE Data type of @p DATA - * @param[in] SRC_DEPTH Input channel size / depth - * @param[in] DATA Value variable base name - * @param[in] ROW The row number to store. 
Supported: 0-8 - * @param[in] OUTPUT_PTR Output pointer - * @{ - */ -#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE -#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - const bool at_channel_boundary = get_global_id(0) == 0; \ - if(at_channel_boundary) \ - { \ - IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - } \ - else \ - { \ - IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - } -#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE -#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) -#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE - -#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - VSTORE(VECTOR_SIZE) \ - (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ - VSTORE(VECTOR_SIZE) \ - (DATA##8, 
0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); - -#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ - (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); -/** @}*/ - -/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC - * - * @note This kernel computes VECTOR_SIZE elements - * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements - * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 - * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). - */ -__kernel void im2col3x3_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding - const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; - const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); - const int yo = get_global_id(1); - const int batch = get_global_id(2); // batch size - - // Calculate input indices - const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; - const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; - - int yi_coord = 0; - int3 offset = 0; - - // Clamp xi - int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT); -#if PAD_LEFT != 0 || PAD_RIGHT != 0 -#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) - xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1)); -#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 - // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor - xi_offset *= (int3)src_stride_y; - - // Out-of-bound condition for X - int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH); - - // yi == 0 - // Clamp yi - // yi_coord is casted to unsigned int in order to use just a min() operation - // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 - // This is a trick so that the values loaded in the padding 
areas are always from the last row (SRC_HEIGHT - 1), - // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around - // also causes y_cond (y padding condition) to be satisfied - yi_coord = yi - (int)PAD_TOP; - - // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 -#if PAD_TOP != 0 || PAD_BOTTOM != 0 - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); -#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 - - // Compute offset - offset = xi_offset + (yi_coord * (int)src_stride_z); - - // Load input values - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); - VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - // Replace invalid values with PAD_VALUE - int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT)); - values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); - values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); - values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - - // yi == 1 - // Clamp yi_coord (it can be negative if PAD_TOP > 1) - yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y; - - // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 -#if PAD_TOP != 0 || PAD_BOTTOM != 0 - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); -#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 - - // Compute offset - offset = xi_offset + (yi_coord * (int)src_stride_z); - - // Load input values - VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 
offset.s1)); - VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - // Replace invalid values with zeros - y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT)); - values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); - values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); - values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - - // yi == 2 - // Clamp yi_coord - yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y; - - // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 -#if PAD_TOP != 0 || PAD_BOTTOM != 0 - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); -#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 - - // Compute offset - offset = xi_offset + (yi_coord * (int)src_stride_z); - - // Load input values - VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); - VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); - VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - // Replace invalid values with PAD_VALUE - y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT)); - values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); - values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); - values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - - // Store in a boundary-aware way to avoid padding - IM2COL1X9_NHWC_STORE(VECTOR_SIZE, 
BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr) - -#ifdef HAS_BIAS - // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is - // added at the end of the channel, while the boundary vec is at the beginning of the channel. - // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in - // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE - // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp - if((ch + VECTOR_SIZE) >= SRC_DEPTH) - { - *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f; - } -#endif // HAS_BIAS -} - -#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 -#define IM2COL1x9(i) \ - ({ \ - yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ - \ - offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ - offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ - \ - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ - VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ - VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ - VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ - VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ - VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ - VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ - VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ - VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ - \ - int y_cond = (int)((uint)(yi - 
(int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \ - values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \ - values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \ - values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \ - values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \ - values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \ - values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \ - values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \ - values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \ - values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \ - \ - IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ - }) -#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 -#define IM2COL1x9(i) \ - ({ \ - yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ - yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ - \ - offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ - offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ - \ - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ - VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ - VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ - VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ - VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ - VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ - 
VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ - VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ - VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ - \ - IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ - }) -#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 - -/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC - * - * @note This kernel computes VECTOR_SIZE elements - * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements - * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 - * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 - * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
- */ -__kernel void im2col9x9_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding - const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; - const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); - const int yo = get_global_id(1); - const int batch = get_global_id(2); // batch size - - // Calculate input indices - const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; - const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; - - int yi_coord = 0; - int8 offset0 = 0; - int offset1 = 0; - - // Clamp xi - int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT); - int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT); - -#if PAD_LEFT != 0 || PAD_RIGHT != 0 -#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) - xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1)); - xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1)); -#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 - xi_offset0 *= (int8)src_stride_y; - xi_offset1 *= (int)src_stride_y; - - // Out-of-bound condition for X - int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH); - int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); - - IM2COL1x9(0); - IM2COL1x9(1); - IM2COL1x9(2); 
- IM2COL1x9(3); - IM2COL1x9(4); - IM2COL1x9(5); - IM2COL1x9(6); - IM2COL1x9(7); - IM2COL1x9(8); - -#ifdef HAS_BIAS - // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is - // added at the end of the channel, while the boundary vec is at the beginning of the channel. - // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in - // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE - // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp - if((ch + VECTOR_SIZE) >= SRC_DEPTH) - { - *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f; - } -#endif // HAS_BIAS -} - -/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC - * - * @note This kernel computes VECTOR_SIZE elements - * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements - * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 - * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 - * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 - * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. 
-DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 - * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 - * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 - * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 - * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 - * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). - */ -__kernel void im2col_generic_nhwc( - TENSOR3D_DECLARATION(src), - IMAGE_DECLARATION(dst), - uint src_stride_w, - uint dst_stride_w) -{ - // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding - const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; - const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); - const int yo = get_global_id(1); - const int batch = get_global_id(2); // batch size - - // Calculate input indices - const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; - const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; - - // Get input and output address - __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; - __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; - - int i = 0; - for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) - { - // Clamp yi_coord - int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; - yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); - - // Out-of-bound condition for Y - int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * 
DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); - - for(int xk = 0; xk < KERNEL_WIDTH; ++xk) - { - // Clamp xi_coord - int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); - xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); - - // Out-of-bound condition for X - int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); - - int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); - - VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); - -#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - // Replace with PAD_VALUE if the value is out-of-bound - values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition))); -#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 - - // Store in a boundary-aware way to avoid padding -#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE - const bool at_channel_boundary = get_global_id(0) == 0; - if(at_channel_boundary) - { - VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) - (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); - } - else // at_channel_boundary -#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE - { - VSTORE(VECTOR_SIZE) - (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); - } - i++; - } - } - -#ifdef HAS_BIAS - // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is - // added at the end of the channel, while the boundary vec is at the beginning of the channel. 
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in - // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE - // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp - if((ch + VECTOR_SIZE) >= SRC_DEPTH) - { - *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; - } -#endif // HAS_BIAS -} -#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) -#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) +#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl new file mode 100644 index 0000000000..0fef98e295 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "tile_helpers.h" + +#define MUL_OP(x, y) ((x) * (y)) +#define ADD_OP(x, y) ((x) + (y)) +#define DIV_OP(x, y) ((x) / (y)) +#define POW_OP(x, y) pow((x), (y)) +#define SQCVT_SAT(a) (a) + +#if defined(NUM_SLICES) +/** Apply cross-map normalization. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. 
-DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA); + + const int current_slice = get_global_id(2); + const int left_slice = max(-(int)RADIUS, -current_slice); + const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice); + + for(int i = left_slice; i <= right_slice; i++) + { + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i)); + acc = ADD_OP(acc, MUL_OP(values, values)); + } + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel = 
DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); + + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); +} +#endif /* defined(NUM_SLICES) */ + +#if defined(WIDTH_SIZE) +/** Apply in-map normalization when tensors are in the NCHW data layout format. + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 + * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 + * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA + * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + acc = 0; + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + coeff_v = SQCVT_SAT(COEFF); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_v = SQCVT_SAT(BETA); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + kappa_v = SQCVT_SAT(KAPPA); + + const int current_col = get_global_id(0) << 2; + const int left_pos = max(-(int)RADIUS, -3 - current_col); + const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col); + +#if defined(IN_MAP_2D) + const int current_row = get_global_id(1); + const int first_row = max(-(int)RADIUS, -current_row); + const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row); +#endif /* defined(IN_MAP_2D) */ + +#if defined(IN_MAP_2D) + for(int j = first_row; j <= last_row; ++j) + { +#endif /* defined(IN_MAP_2D) */ + for(int i = left_pos; i <= right_pos; ++i) + { +#if defined(IN_MAP_2D) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0)); +#else /* 
defined(IN_MAP_2D) */ + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0)); +#endif /* defined(IN_MAP_2D) */ + acc = ADD_OP(acc, MUL_OP(values, values)); + } +#if defined(IN_MAP_2D) + } +#endif /* defined(IN_MAP_2D) */ + + acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized = POW_OP(acc, beta_v); + const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); + + VSTORE(VEC_SIZE) + (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); +} +#endif // defined(WIDTH_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl new file mode 100644 index 0000000000..23a0de76f7 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. 
-DVEC_SIZE=8 + * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor + */ +__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector std = CONVERT_TO_VECTOR_STRUCT(std); + + const uint current_slice = get_global_id(2) % NUM_CHANNELS; + + const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))); + const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))); + + TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); + TYPE res = (data - curr_mean) / curr_std; + + VSTORE(VEC_SIZE) + (res, 0, (__global DATA_TYPE *)dst.ptr); +} +#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl new file mode 100644 index 0000000000..0f02ef6184 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) + +#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) +#define OFFSET_FLT ((float)OFFSET) +#define SCALE_FLT ((float)SCALE) + +#if defined(NUM_CHANNELS) + +/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. + * + * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 + * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 + * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 + * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 + * + * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] std_ptr Pointer to the std tensor. 
Supported data types: same as @p src_ptr + * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) + * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor + */ +__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst), + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(std)) +{ + Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); + Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); + Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); + Vector std = CONVERT_TO_VECTOR_STRUCT(std); + + const uint current_slice = get_global_id(2) % NUM_CHANNELS; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)))); + curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)))); + curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; + + VEC_DATA_TYPE(float, VEC_SIZE) + data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE)); + data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT; + + // Perform normalization + VEC_DATA_TYPE(float, VEC_SIZE) + res_flt = (data_flt - curr_mean_flt) / curr_std_flt; + + const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); + VSTORE(VEC_SIZE) + (res_u8, 0, (__global DATA_TYPE *)dst.ptr); +} + +#endif // defined(NUM_CHANNELS) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl new file mode 100644 index 0000000000..790ddb381a --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl @@ -0,0 +1,331 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(FP_MIXED_PRECISION) +#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ + CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) +#else /* defined(FP_MIXED_PRECISION) */ +#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) +#endif /* defined(FP_MIXED_PRECISION) */ + +ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +/** Performs a pooling function of pool size equal to N (NCHW) + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. 
-DPOOL_SIZE_X=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_MxN_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) + vdata = INITIAL_VALUE; + ACC_DATA_TYPE sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE_Y; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) + { + VEC_DATA_TYPE(ACC_DATA_TYPE, 8) + data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE_X; ++x) + { + ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0))); +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif /* defined(POOL_L2) */ + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + VEC_DATA_TYPE(ACC_DATA_TYPE, 4) + reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + VEC_DATA_TYPE(ACC_DATA_TYPE, 2) + reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + 
ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + +#if defined(POOL_AVG) || defined(POOL_L2) + // Divide by pool region in case of average pooling + res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res = SQRT_OP(res); +#endif /* defined(POOL_L2) */ + + // Store result + *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; +} +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) +{ + const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; + const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM; + + const int x = get_global_id(0) * STRIDE_X; + const int y = get_global_id(1) * STRIDE_Y; + const int z = get_global_id(2); + + //x axis: width, y axis: height, z axis: component + const uint padded_offset = input->offset_first_element_in_bytes + + x * input->stride_x + + y * input->stride_y + + z * input->stride_z; + + const uint offset_base = padded_offset + - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ + - PAD_TENSOR_TOP * input->stride_y /* top padding */ + - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */ + - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); + +#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) + *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT)); +#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ + *offset_top = (uint)(offset_base / sizeof(DATA_TYPE)); +#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && 
defined(TENSOR_HEIGHT) */ + + *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; + + return; +} + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp32( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); + float2 data1 = VLOAD(2)(0, 
(__global float *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + float data0_max = POOL_OP(data0.s0, data0.s1); + float data1_max = POOL_OP(data1.s0, data1.s1); + float res = POOL_OP(data0_max, data1_max); + // Store result + *(__global float *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +} + +/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. + * + * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr Pointer to the indices tensor. 
Supported data types: U32 + * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2_nchw_indices_fp16( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + TENSOR3D_DECLARATION(indices)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); + + // Load data + half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); + half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0)); + + // Perform calculations + half data0_max = POOL_OP(data0.s0, data0.s1); + half data1_max = POOL_OP(data1.s0, data1.s1); + half res = POOL_OP(data0_max, data1_max); + // Store result + *(__global half *)output.ptr = res; + +#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) + + uint offset_top = 0; + uint offset_bottom = 0; + + offset_no_padding_nchw(&input, &offset_top, &offset_bottom); + + uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); + uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); + uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); + + *(__global uint *)indices.ptr = 
index; + +#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) +}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl new file mode 100644 index 0000000000..1440ef3ed1 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl @@ -0,0 +1,142 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(INITIAL_VALUE) +#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + +#if defined(POOL_AVG) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) */ +#define POOL_OP(x, y) (max((x), (y))) +#endif /* defined(POOL_AVG) */ + +#define DIV_OP(x, y) (x * (1.f / y)) + +#if defined(POOL_L2) +#error "L2 pooling is not supported" +#endif /* defined(POOL_L2) */ + +int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, + const int pad_x, const int pad_y, const int stride_x, const int stride_y) +{ + int start_x = get_global_id(0) * stride_x - pad_x; + int start_y = get_global_id(1) * stride_y - pad_y; + const int end_x = min(start_x + pool_size_x, upper_bound_w); + const int end_y = min(start_y + pool_size_y, upper_bound_h); +#if defined(EXCLUDE_PADDING) + start_x = max(0, start_x); + start_y = max(0, start_y); +#endif /* defined(EXCLUDE_PADDING) */ + return ((end_y - start_y) * (end_x - start_x)); +} + +/** Performs a pooling function of pool size equal to N (NCHW) + * + * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; + * @note In case of average pooling the following information must be passed at compile time: + * -DPOOL_AVG must be provided otherwise max pooling will be performed. + * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) + * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension + * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source image. 
Supported data types: QASYMM8/QASYMM8_SIGNED + * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void pooling_layer_MxN_quantized_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output)) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + + int8 vdata = INITIAL_VALUE; + int sdata = INITIAL_VALUE; + + // Load data + for(int y = 0; y < POOL_SIZE_Y; y++) + { + int x = 0; + for(; x <= ((int)POOL_SIZE_X - 8); x += 8) + { + VEC_TYPE(8) + data = vload8(0, (__global DATA_TYPE 
*)tensor3D_offset(&input, x, y, 0)); + int8 data0 = convert_int8(data); + vdata = POOL_OP(vdata, data0); + } + + // Leftover + for(; x < (int)POOL_SIZE_X; ++x) + { + DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); + int data0 = convert_int(data); + sdata = POOL_OP(sdata, data0); + } + } + + // Reduce result + int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567); + int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23); + int res = POOL_OP(reduce2.s0, reduce2.s1); + res = POOL_OP(res, sdata); + +#if defined(POOL_AVG) + res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); +#endif /* defined(POOL_AVG) */ + + DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + + const float result_f32 = convert_float(result_q8); + const float input_offset = (float)OFFSET_IN1; + const float input_scale = (float)SCALE_IN1; + const float scale_out = (float)SCALE_OUT; + const float offset_out = (float)OFFSET_OUT; + const float in_f32 = (result_f32 - input_offset) * input_scale; + const float out_f32 = in_f32 / scale_out + offset_out; + result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE); + +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + + *(__global DATA_TYPE *)output.ptr = result_q8; +} +#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl index de10decdec..7524ba7b4a 100644 --- a/src/core/CL/cl_kernels/prior_box_layer.cl +++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl new file mode 100644 index 0000000000..fab88a1682 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/remap.cl @@ -0,0 +1,133 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +#ifndef DEPTH_OUT +/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation. 
+ * + * This kernel performs remapping with this method of pixel coordinate translation: + * out(x,y) = in(mapx(x,y), mapy(x,y)); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32. 
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + */ +__kernel void remap_nearest_neighbour_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + IMAGE_DECLARATION(mapx), + IMAGE_DECLARATION(mapy), + const float width, + const float height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx); + Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy); + + float4 mapx_coords = vload4(0, (__global float *)mapx.ptr); + float4 mapy_coords = vload4(0, (__global float *)mapy.ptr); + float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1, + mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3); + + vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr); +} + +/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation. + * + * This kernel performs remapping with this method of pixel coordinate translation: + * out(x,y) = in(mapx(x,y), mapy(x,y)); + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8. 
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image + * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. + * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32. 
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) + * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) + * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) + * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) + * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image + * @param[in] width Width of the input image + * @param[in] height Height of the input image + */ +__kernel void remap_bilinear_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + IMAGE_DECLARATION(mapx), + IMAGE_DECLARATION(mapy), + const float width, + const float height) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx); + Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy); + + float4 mapx_coords = vload4(0, (__global float *)mapx.ptr); + float4 mapy_coords = vload4(0, (__global float *)mapy.ptr); + float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1, + mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3); + + vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr); +} +#endif // DEPTH_OUT
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl index 29344de37a..f66b17c1a6 100644 --- a/src/core/CL/cl_kernels/reorg_layer.cl +++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -72,45 +72,4 @@ __kernel void reorg_layer_nchw( int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z; *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset)); } - -/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC - * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float - * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64 - * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: All - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void reorg_layer_nhwc( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst); - - int xo = get_global_id(1); - int yo = get_global_id(2); - int zo = get_global_id(0); - int xi, yi, zi; - - CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi); - - int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset)); -} #endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl new file mode 100644 index 0000000000..63a53cc4f2 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/scale.cl @@ -0,0 +1,148 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" +#include "warp_helpers.h" + +/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. + * + * @param[in] coord 2D coordinates to transform. + * @param[in] scale input/output scale ratio + * + * @return a float8 containing 4 2D transformed values in the input image. 
+ */ +inline const float8 transform_nearest(const float2 coord, const float2 scale) +{ +#ifdef SAMPLING_POLICY_TOP_LEFT + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + const float4 new_x = in_x_coords * (float4)(scale.s0); + const float4 new_y = (float4)(coord.s1 * scale.s1); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +#elif SAMPLING_POLICY_CENTER + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); + const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0); + const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ +} + +/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. + * + * @param[in] coord 2D coordinates to transform. + * @param[in] scale input/output scale ratio + * + * @return a float8 containing 4 2D transformed values in the input image. 
+ */ +inline const float8 transform_bilinear(const float2 coord, const float2 scale) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); +#ifdef SAMPLING_POLICY_TOP_LEFT + const float4 new_x = in_x_coords * (float4)(scale.s0); + const float4 new_y = (float4)(coord.s1 * scale.s1); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +#elif SAMPLING_POLICY_CENTER + const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); + const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ +} + +/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16. + * + * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. 
(Must be the same as the input) + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + */ +__kernel void scale_nearest_neighbour_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + const float2 r = (float2)(scale_x, scale_y); + float8 transformed = transform_nearest(get_current_coords(), r); +#ifdef ALIGN_CORNERS + transformed = round(transformed); +#endif // ALIGN_CORNERS + const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE); + vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr); +} + +/** Performs an affine transformation on an image interpolating with the BILINEAR method. + * + * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * + * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16. 
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + */ +__kernel void scale_bilinear_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + const float2 r = (float2)(scale_x, scale_y); + const float8 tc = transform_bilinear(get_current_coords(), r); + vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr); +}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl new file mode 100644 index 0000000000..946ad65c14 --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/scale_quantized.cl @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers_asymm.h" +#include "warp_helpers_quantized.h" + +/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. + * + * @param[in] coord 2D coordinates to transform. + * @param[in] scale input/output scale ratio + * + * @return a float8 containing 4 2D transformed values in the input image. 
+ */ +inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale) +{ + const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); +#ifdef SAMPLING_POLICY_TOP_LEFT + const float4 new_x = in_x_coords * (float4)(scale.s0); + const float4 new_y = (float4)(coord.s1 * scale.s1); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +#elif SAMPLING_POLICY_CENTER + const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); + const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); + return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); +#else /* SAMPLING_POLICY */ +#error("Unsupported sampling policy"); +#endif /* SAMPLING_POLICY */ +} + +/** Performs an affine transformation on an image interpolating with the BILINEAR method. + * + * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT + * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5 + * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1 + * + * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. + * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image + * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. 
(Must be the same as the input) + * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) + * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) + * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image + * @param[in] input_width Input image width + * @param[in] input_height Input image height + * @param[in] scale_x The scale factor along x dimension + * @param[in] scale_y The scale factor along y dimension + */ +__kernel void scale_bilinear_quantized_nchw( + IMAGE_DECLARATION(in), + IMAGE_DECLARATION(out), + const float input_width, + const float input_height, + const float scale_x, + const float scale_y) +{ + Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); + Image out = CONVERT_TO_IMAGE_STRUCT(out); + const float2 r = (float2)(scale_x, scale_y); + const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r); + vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr); +}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl index cb11786ac4..e162a29bb0 100644 --- a/src/core/CL/cl_kernels/space_to_batch.cl +++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -93,75 +93,7 @@ __kernel void space_to_batch_nchw( *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w)); } } -/** Calculate the space to batch conversion. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] paddings_ptr Pointer to the second source image. 
Supported data types: S32 - * @param[in] paddings_stride_x Stride of the paddinds tensor in X dimension (in bytes) - * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] paddings_stride_y Stride of the paddinds tensor in Y dimension (in bytes) - * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] paddingse_offset_first_element_in_bytes The offset of the first element in the second source image - * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32 - * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) - * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor - * @param[in] batch_id The output tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_batch_nhwc( - TENSOR4D_DECLARATION(input), - IMAGE_DECLARATION(paddings), - VECTOR_DECLARATION(block_shape), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings); - Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - const int pad_left_x = *((__global int *)offset(&pad, 0, 0)); - const int pad_right_x = *((__global int *)offset(&pad, 1, 0)); - const int pad_left_y = *((__global int *)offset(&pad, 0, 1)); - const int pad_right_y = *((__global int *)offset(&pad, 1, 1)); - - int block_x = *((__global int *)vector_offset(&block, 0)); - int block_y = *((__global int *)vector_offset(&block, 1)); - - const int out_x = get_global_id(1); - const int out_y = get_global_id(2); - const int z = get_global_id(0); - - const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); - const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); - - if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN))) - { - const int w = batch_id % BATCH_IN; - const int 
in_x = pos_x - pad_left_x; - const int in_y = pos_y - pad_left_y; - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); - } -} #endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN) #if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) @@ -221,60 +153,4 @@ __kernel void space_to_batch_static_nchw( *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w)); } } -/** Calculate the space to batch conversion. (NHWC) - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 - * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 - * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 - * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2 - * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2 - * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2 - * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image - * @param[in] batch_id The output tensor batch id - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void space_to_batch_static_nhwc( - TENSOR4D_DECLARATION(input), - const int batch_id, - TENSOR3D_DECLARATION(output)) -{ - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - int block_x = BLOCK_SHAPE_X; - int block_y = BLOCK_SHAPE_Y; - - const int out_x = get_global_id(1); - const int out_y = get_global_id(2); - const int z = get_global_id(0); - - const int pos_x 
= out_x * block_x + ((batch_id / BATCH_IN) % block_x); - const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); - - if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN) - { - const int w = batch_id % BATCH_IN; - const int in_x = pos_x - PAD_LEFT_X; - const int in_y = pos_y - PAD_LEFT_Y; - - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); - } -} #endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl new file mode 100644 index 0000000000..aea02e813b --- /dev/null +++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Space to depth transformation. (NCHW) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_depth_nchw( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(0); + const int y = get_global_id(1); + const int z = get_global_id(2) % r; + + const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE; + const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id)); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

/** This function applies upsample on an input image. (NCHW)
 *
 * Each source element is replicated into a 2x2 patch of the destination
 * (duplicated along X within the work-item, and written to two consecutive destination rows).
 *
 * @attention The following variables must be passed at compile time:
 * -# -DDATA_TYPE = Tensor data type. Supported data types: All
 * -# -DVEC_SIZE_IN = Input vector size
 * -# -DVEC_SIZE_OUT = Output vector size
 * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
 * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
 *
 * @param[in]  src_ptr                           Pointer to the source image. Supported data types: All
 * @param[in]  src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[out] dst_ptr                           Pointer to the destination image. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination image in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination image in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination image
 */
__kernel void upsample_layer_nchw(
    TENSOR3D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
    Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);

#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
    // Check if access on width gets out of bounds
    // If it does shift access vector to access elements within bounds
    const int xi_in  = (int)(get_global_id(0) * VEC_SIZE_IN);
    const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
    src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
    dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;

    // Read 8 source elements and duplicate each along X (8 -> 16 elements).
    VEC_DATA_TYPE(DATA_TYPE, 8)
    data = vload8(0, (__global DATA_TYPE *)src.ptr);

    VEC_DATA_TYPE(DATA_TYPE, 16)
    data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);

    // Write the expanded row to two consecutive destination rows (duplication along Y).
    vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
    vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
#else  // Scalar fallback: at least one of VEC_SIZE_IN / VEC_SIZE_OUT / LAST_ACCESSED_X_IN / LAST_ACCESSED_X_OUT is not defined
    // Copy the single source element to rows y and y+1 of the destination.
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
    *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
}
/*
 * Copyright (c) 2018-2021 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "helpers.h"

#if defined(SRC_DIM_Z)
/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor. Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_2x2_3x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

    // Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    // 3x1 filter: a single row of 3 weights
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // 1x3 filter: a single column of 3 weights gathered across rows
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Full 3x3 filter: three rows of 3 weights
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

    // Compute the 4x4 (or 1x4) transformed filter: out = G * w * G^T
    // Row 0
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out0 = 0.0f;
    out0.s0 = (w0.s0);
    out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
    out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
    out0.s3 = (w0.s2);

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out1 = 0.0f;
    out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
    out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
    out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;

    // Row 2
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out2 = 0.0f;
    out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
    out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
    out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
    out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;

    // Row 3
    VEC_DATA_TYPE(DATA_TYPE, 4)
    out3 = 0.0f;
    out3.s0 = (w2.s0);
    out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
    out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
    out3.s3 = (w2.s2);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // The Z work-grid dimension packs (filter index, channel index)
    int z  = get_global_id(2);
    int x0 = z / SRC_DIM_Z; // idx filter
    int y0 = z % SRC_DIM_Z; // idx channel

    // Get output address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;

    // Store the values across the channels
    // 16 channels for 3x3 kernels
    // 4 channels for 3x1 or 1x3 kernels
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z)  = out1.s0;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z)  = out1.s1;
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s2;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s3;
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out2.s0;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out2.s1;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}

/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor.
 *                                               Supported data types: F32/F16
 * @param[in]  src_stride_x                      Stride of the source tensor in X dimension (in bytes)
 * @param[in]  src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  src_stride_y                      Stride of the source tensor in Y dimension (in bytes)
 * @param[in]  src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in]  src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in]  src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in]  src_offset_first_element_in_bytes The offset of the first element in the source tensor
 * @param[out] dst_ptr                           Pointer to the destination tensor. Supported data types: same as @p src_ptr
 * @param[in]  dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in]  dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in]  dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in]  dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in]  dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in]  dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in]  dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
__kernel void winograd_filter_transform_4x4_3x3_nchw(
    TENSOR4D_DECLARATION(src),
    TENSOR3D_DECLARATION(dst))
{
    Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);

    const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);

    // Load the values from the input tensor
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
    // 3x1 filter: a single row of 3 weights
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // 1x3 filter: a single column of 3 weights gathered across rows
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
                                       *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Full 3x3 filter: three rows of 3 weights
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
    VEC_DATA_TYPE(DATA_TYPE, 3)
    w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)

    // Compute the 6x6 (or 1x6) transformed filter: out = G * w * G^T
    // Only .s0-.s5 of each 8-wide vector are used; .s6/.s7 stay zero.
    // Row 0
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out0 = 0.0f;
    out0.s0 = (w0.s0) / 16.f;
    out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
    out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
    out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
    out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
    out0.s5 = (w0.s2) / 4.f;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    // Row 1
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out1 = 0.0f;
    out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
    out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
    out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
    out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
    out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
    out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;

    // Row 2
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out2 = 0.0f;
    out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
    out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
    out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
    out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
    out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
    out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;

    // Row 3
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out3 = 0.0f;
    out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
    out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;

    // Row 4
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out4 = 0.0f;
    out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
    out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
    out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
    out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;

    // Row 5
    VEC_DATA_TYPE(DATA_TYPE, 8)
    out5 = 0.0f;
    out5.s0 = (w2.s0) / 4.f;
    out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
    out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
    out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
    out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
    out5.s5 = (w2.s2);
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)

    // The Z work-grid dimension packs (filter index, channel index)
    int z  = get_global_id(2);
    int x0 = z / SRC_DIM_Z; // idx filter
    int y0 = z % SRC_DIM_Z; // idx channel

    // Get output address
    __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;

    // Store the values across the channels
    // 36 channels for 3x3 kernels
    // 6 channels for 3x1 or 1x3 kernels
    *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
    *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
    *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
    *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
    *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
    *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;

#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
    *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z)  = out1.s0;
    *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z)  = out1.s1;
    *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z)  = out1.s2;
    *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z)  = out1.s3;
    *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
    *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
    *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
    *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
    *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
    *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
    *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
    *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
    *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
    *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
    *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
    *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
    *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
    *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
    *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
    *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
    *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
    *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
    *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
    *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
    *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
    *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
    *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
    *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
    *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
    *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}

/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
 *
 * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
 *
 * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
 * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
 *
 * @param[in]  src_ptr                           Pointer to the source tensor.
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x4_5x5_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); + + const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); + + // Load the values from the input tensor +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + 
VEC_DATA_TYPE(DATA_TYPE, 4) + w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4); +#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), + *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); +#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + VEC_DATA_TYPE(DATA_TYPE, 4) + w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); + DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4); + VEC_DATA_TYPE(DATA_TYPE, 4) + w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); + DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4); + VEC_DATA_TYPE(DATA_TYPE, 4) + w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); + DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4); + VEC_DATA_TYPE(DATA_TYPE, 4) + w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); + DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4); + VEC_DATA_TYPE(DATA_TYPE, 4) + w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); + DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4); +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + + // Transform the input tile + + // Row 0 + VEC_DATA_TYPE(DATA_TYPE, 8) + out0 = 0.0f; + out0.s0 = w00.s0; + out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f; + out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f; + out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f; + out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 
8.f * w00.s3 + 16.f * w01) / 90.f; + out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f; + out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f; + out0.s7 = w01; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + // Row 1 + VEC_DATA_TYPE(DATA_TYPE, 8) + out1 = 0.0f; + out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f; + out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + + (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f; + out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - + (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f; + out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f * + (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f; + out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f * + (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f; + out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f * + (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f; + out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f * + (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 
+ w21 + w31 + w41)) / 810.f; + out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f; + + // Row 2 + VEC_DATA_TYPE(DATA_TYPE, 8) + out2 = 0.0f; + out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f; + out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + + (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f; + out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - + (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f; + out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f * + (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f; + out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f * + (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f; + out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f * + (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f; + out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f * + (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f; + out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f; + + // Row 3 + VEC_DATA_TYPE(DATA_TYPE, 8) + out3 = 0.0f; + out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f; + out3.s1 
= -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + + (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f; + out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + + (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f; + out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f * + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f; + out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f * + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f; + out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f; + out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 
16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + + (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f; + out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f; + + // Row 4 + VEC_DATA_TYPE(DATA_TYPE, 8) + out4 = 0.0f; + out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f; + out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f; + out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f; + out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f * + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; + out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * 
w40.s3) + 16.f * + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; + out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; + out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * + (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + + (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; + out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f; + + // Row 5 + VEC_DATA_TYPE(DATA_TYPE, 8) + out5 = 0.0f; + out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f; + out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; + out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; + out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + 
w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; + out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; + out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; + out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + + (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; + out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f; + + // Row 6 + VEC_DATA_TYPE(DATA_TYPE, 8) + out6 = 0.0f; + out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f; + out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * 
w21 - 2.f * w31 + w41)) / 810.f; + out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; + out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; + out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; + out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; + out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * + (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + + (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; + 
out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f; + + // Row 7 + VEC_DATA_TYPE(DATA_TYPE, 8) + out7 = 0.0f; + out7.s0 = w40.s0; + out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f; + out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f; + out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f; + out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f; + out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f; + out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f; + out7.s7 = w41; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + + int z = get_global_id(2); + int x0 = z / SRC_DIM_Z; // idx filter + int y0 = z % SRC_DIM_Z; // idx channel + + // Get output address + __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; + + // Store the values across the channels + *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; + *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; + *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; + *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; + *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; + *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; + *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; + *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; + +#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) + *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; + *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; + *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; + *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; + *(__global DATA_TYPE *)(dst_addr + 12 * 
dst_stride_z) = out1.s4; + *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; + *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; + *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; + *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; + *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; + *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; + *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; + *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; + *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; + *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; + *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; + *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; + *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; + *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; + *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; + *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; + *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; + *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; + *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; + *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; + *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; + *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; + *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; + *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; + *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; + *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; + *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; + *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; + *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; + *(__global DATA_TYPE 
*)(dst_addr + 42 * dst_stride_z) = out5.s2; + *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; + *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; + *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; + *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; + *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; + *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; + *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; + *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; + *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; + *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; + *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; + *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; + *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; + *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; + *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; + *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; + *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; + *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; + *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; + *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; + *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; +#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +} + +#endif // defined(SRC_DIM_Z) + +#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. 
-DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_2x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_3x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, 
+ dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_4x1_5x1_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x2_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_2x2_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, 
+ dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x3_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_3x3_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4 + * + * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 + * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_filter_transform_1x4_1x5_nchw( + TENSOR4D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + winograd_filter_transform_4x4_5x5_nchw(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, 
+ dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_offset_first_element_in_bytes); +} + +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl index fbb5e95196..8c382183c3 100644 --- a/src/core/CL/cl_kernels/winograd_input_transform.cl +++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl @@ -908,893 +908,6 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw( #endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) } -#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. 
-DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. 
Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! @endcond -__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. 
-#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _INUM_TILES_X NUM_TILES_X -#define _INUM_TILES_Y NUM_TILES_Y - - int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; - int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; - x -= PAD_LEFT; - y -= PAD_TOP; - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 6, 1, in); - TILE(DATA_TYPE, 6, 1, out); - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 6, - { - in[i].v = 0; - }) - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); -#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); -#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - - TILE(DATA_TYPE, 6, 1, com); - - LOOP_UNROLLING(int, i, 0, 1, 6, - { - in[i].v *= 4.0f; - }) - - com[0].v = in[2].v - 4.f * in[0].v; - com[1].v = in[3].v - 4.f * in[1].v; - com[2].v = in[4].v - 4.f * in[2].v; - com[3].v = in[5].v - 4.f * in[3].v; - com[4].v = in[3].v - in[1].v; - com[4].v = com[4].v + com[4].v; - com[5].v = in[4].v - in[2].v; - - out[0].v = com[2].v - com[0].v; - out[1].v = com[2].v + com[1].v; - out[2].v = com[2].v - com[1].v; - out[3].v = com[5].v + com[4].v; - out[4].v = com[5].v - com[4].v; - out[5].v = com[3].v - com[1].v; - - TILE(uint, 6, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 6, - { - dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y; - dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6; - }) - - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 36, 1, in); - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 36, - { - 
in[i].v = 0; - }) - - // Load the tile from a NHWC tensor - T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); - - TILE(DATA_TYPE, 6, 1, com); - TILE(DATA_TYPE, 36, 1, tmp); - - LOOP_UNROLLING(int, i, 0, 1, 6, - { - com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v; - com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v; - com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v; - com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v; - com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v; - com[4].v = com[4].v + com[4].v; - com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v; - tmp[i + 0 * 6].v = com[2].v - com[0].v; - tmp[i + 1 * 6].v = com[2].v + com[1].v; - tmp[i + 2 * 6].v = com[2].v - com[1].v; - tmp[i + 3 * 6].v = com[5].v + com[4].v; - tmp[i + 4 * 6].v = com[5].v - com[4].v; - tmp[i + 5 * 6].v = com[3].v - com[1].v; - }) - - TILE(DATA_TYPE, 36, 1, out); - - LOOP_UNROLLING(int, i, 0, 1, 6, - { - com[0].v = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v; - com[1].v = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v; - com[2].v = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v; - com[3].v = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v; - com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v; - com[4].v = com[4].v + com[4].v; - com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v; - out[i * 6 + 0].v = com[2].v - com[0].v; - out[i * 6 + 1].v = com[2].v + com[1].v; - out[i * 6 + 2].v = com[2].v - com[1].v; - out[i * 6 + 3].v = com[5].v + com[4].v; - out[i * 6 + 4].v = com[5].v - com[4].v; - out[i * 6 + 5].v = com[3].v - com[1].v; - }) - - // Compute destination address - TILE(uint, 36, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 36, - { - dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y; - dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36; - }) - - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); -#endif // 
defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) -} - -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _INUM_TILES_X NUM_TILES_X -#define _INUM_TILES_Y NUM_TILES_Y - - int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; - int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; - x -= PAD_LEFT; - y -= PAD_TOP; - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 8, 1, in); - TILE(DATA_TYPE, 8, 1, out); - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 8, - { - in[i].v = 0; - }) - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); -#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); -#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - - TILE(DATA_TYPE, 1, 8, com); - - com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v; - com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v; - com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v; - com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v; - com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; - com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v; - out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v; - out[1].s[0] = com[0].s[0] + com[0].s[1]; - out[2].s[0] = com[0].s[0] - com[0].s[1]; - out[3].s[0] = com[0].s[3] + com[0].s[2]; - out[4].s[0] = 
com[0].s[3] - com[0].s[2]; - out[5].s[0] = com[0].s[4] + com[0].s[5]; - out[6].s[0] = com[0].s[4] - com[0].s[5]; - out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v; - - TILE(uint, 8, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y; - dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8; - }) - - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 64, 1, in); - TILE(DATA_TYPE, 64, 1, out); - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 64, - { - in[i].v = 0; - }) - - // Load the tile from a NHWC tensor - T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); - - TILE(DATA_TYPE, 8, 8, com); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x - com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x - com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x - com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x - com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; - com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0]; - com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0]; - com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0]; - }) - - 
TILE(DATA_TYPE, 8, 8, tmp); - tmp[0].v = com[6].v; - tmp[1].v = com[0].v + com[1].v; - tmp[2].v = com[0].v - com[1].v; - tmp[3].v = com[2].v + com[3].v; - tmp[4].v = com[2].v - com[3].v; - tmp[5].v = com[4].v + com[5].v; - tmp[6].v = com[4].v - com[5].v; - tmp[7].v = com[7].v; - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6]; - com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5]; - com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5]; - com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6]; - com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6]; - com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5]; - out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6]; - out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1]; - out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1]; - out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2]; - out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2]; - out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5]; - out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5]; - out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7]; - }) - - TILE(uint, 64, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 64, - { - dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y; - dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64; - }) - - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) -} - -//! 
@cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - - // All the tensor dimensions are passed at compile time. - // In case of dynamic tensor support, the following dimensions should be passed as function argument. -#define _ISRC_WIDTH SRC_WIDTH -#define _ISRC_HEIGHT SRC_HEIGHT -#define _INUM_TILES_X NUM_TILES_X -#define _INUM_TILES_Y NUM_TILES_Y - - int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; - int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; - x -= PAD_LEFT; - y -= PAD_TOP; - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 8, 1, in); - TILE(DATA_TYPE, 8, 1, out); - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 8, - { - in[i].v = 0; - }) - -#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); -#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); -#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - in[i].v *= (DATA_TYPE) - 36.0f; - }) - - TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } }; - - com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v; - com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v; - com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v; - com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v; - com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v; - com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v; - out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v; - out[1].s[0] = 
com[0].s[0] - com[0].s[1]; - out[2].s[0] = com[0].s[0] + com[0].s[1]; - out[3].s[0] = com[0].s[2] - com[0].s[3]; - out[4].s[0] = com[0].s[2] + com[0].s[3]; - out[5].s[0] = com[0].s[4] - com[0].s[5]; - out[6].s[0] = com[0].s[4] + com[0].s[5]; - out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v; - - TILE(uint, 8, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y; - dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8; - }) - - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 64, 1, in); - TILE(DATA_TYPE, 64, 1, out); - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 64, - { - in[i].v = 0; - }) - - // Load the tile from a NHWC tensor - T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in); - - TILE(DATA_TYPE, 8, 8, com); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; - com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; - com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; - com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; - com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; - com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0]; - com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - 
(DATA_TYPE)14.0f * in[4 * 8 + i].s[0]; - com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0]; - }) - - TILE(DATA_TYPE, 8, 8, tmp); - tmp[0].v = com[6].v; - tmp[1].v = com[0].v - com[1].v; - tmp[2].v = com[0].v + com[1].v; - tmp[3].v = com[2].v - com[3].v; - tmp[4].v = com[2].v + com[3].v; - tmp[5].v = com[4].v - com[5].v; - tmp[6].v = com[4].v + com[5].v; - tmp[7].v = com[7].v; - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6]; - com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5]; - com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6]; - com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5]; - com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6]; - com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5]; - out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6]; - out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1]; - out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1]; - out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3]; - out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3]; - out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5]; - out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5]; - out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7]; - }) - - TILE(uint, 64, 1, dst_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 64, - { - dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y; - dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64; - }) - - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL) -} - -//! 
@cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes); -} - -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes); -} - -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes); -} - -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes); -} - -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes); -} - -//! @cond Doxygen_Suppress -/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC - * - * @note Data layout supported: NHWC - * @note Data type supported: F32/F16 - * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) - * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). - * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) - * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -//! 
@endcond -__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER)) -{ - winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes); -} -#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) - #if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) /** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1 * diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl index 6a3e6d3346..861ed50651 100644 --- a/src/core/CL/cl_kernels/winograd_output_transform.cl +++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl @@ -176,181 +176,6 @@ __kernel void winograd_output_transform_2x2_3x3_nchw( (__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y)); #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } - -/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. 
-DSRC_HEIGHT=32 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_2x2_7x7_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // defined(HAS_BIAS) - int dst_size) -{ -#define _ISRC_HEIGHT SRC_HEIGHT -#define _IDST_WIDTH DST_WIDTH -#define _IDST_HEIGHT DST_HEIGHT -#define _INUM_TILES_X NUM_TILES_X - - const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - - int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 8, N0, in); - TILE(DATA_TYPE, 2, N0, out); - TILE(uint, 8, 1, src_indirect_y); - - // Calculate the indirect Y for the source tensor - LOOP_UNROLLING(int, i, 0, 1, 8, - { - src_indirect_y[i].v = mout + i * _ISRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 8, - { 
- in[i].v = 0; - }) - - // Load the values across the 8 channels to compose the 8x1 tile - T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - // Compute out0 and out01 - out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v; - out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v; - -#if defined(HAS_BIAS) - // Add bias - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out); -#endif // defined(HAS_BIAS) - - T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 2, 1, dst_indirect_y); - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, yk, 0, 1, 2, - { - int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); - dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH); - }) -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, xk, 0, 1, 2, - { - int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); - dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH); - }) -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 64, N0, in); - TILE(DATA_TYPE, 4, N0, out); - TILE(DATA_TYPE, 16, N0, tmp); - TILE(uint, 64, 1, src_indirect_y); - - // Calculate the indirect Y for the source tensor - LOOP_UNROLLING(int, i, 0, 1, 64, - { - src_indirect_y[i].v = mout + i * _ISRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 64, - { - in[i].v = 0; - }) - - // Load the values across the 64 channels to 
compose the 8x8 tile - T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v; - tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v; - }) - - // Compute the 2x2 output tile - LOOP_UNROLLING(int, i, 0, 1, 2, - { - out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v; - out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v; - }) - -#if defined(HAS_BIAS) - // Add bias - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); -#endif // defined(HAS_BIAS) - - T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 4, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, yk, 0, 1, 2, - { - LOOP_UNROLLING(int, xk, 0, 1, 2, - { - int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); - int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); - dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH; - dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); - }) - }) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); -#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -} #endif // defined(VEC_SIZE) && VEC_SIZE == 2 #if defined(VEC_SIZE) && VEC_SIZE == 4 @@ -577,200 +402,6 @@ __kernel void winograd_output_transform_4x4_3x3_nchw( #endif // 
!defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] dst_size Size of the destination tensor, minus the last padding - */ -__kernel void winograd_output_transform_4x4_3x3_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, 
BUFFER), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // defined(HAS_BIAS) - int dst_size) -{ - const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - TILE(DATA_TYPE, 6, N0, in); - TILE(DATA_TYPE, 4, N0, out); - TILE(uint, 6, 1, src_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 6, - { - src_indirect_y[i].v = mout + i * SRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 6, - { - in[i].v = 0; - }) - - // Load the values across the 36 channels to compose the 6x6 or 6x1 tile - T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - // Compute out00, out01, out02 and out03 - out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v; - out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v; - out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v; - out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v; - -#if defined(HAS_BIAS) - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); -#endif // HAS_BIAS - - int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; - - T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 4, 1, dst_indirect_y); - - // Calculate the destination indirect Y -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, yk, 0, 1, 4, - { - int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); - dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH; - dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) -#else // 
defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, xk, 0, 1, 4, - { - int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); - dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH; - dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - // Calculate the indirect Y for the source tensor - TILE(DATA_TYPE, 36, N0, in); - TILE(DATA_TYPE, 4, N0, tmp); - TILE(uint, 36, 1, src_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 36, - { - src_indirect_y[i].v = mout + i * SRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 36, - { - in[i].v = 0; - }) - - // Load the values across the 36 channels to compose the 6x6 or 6x1 tile - T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - LOOP_UNROLLING(int, i, 0, 1, 6, - { - tmp[0].v = in[6 + i].v + in[12 + i].v; - tmp[1].v = in[6 + i].v - in[12 + i].v; - tmp[2].v = in[18 + i].v + in[24 + i].v; - tmp[3].v = in[18 + i].v - in[24 + i].v; - tmp[3].v = tmp[3].v + tmp[3].v; - in[i].v = in[i].v + tmp[0].v + tmp[2].v; - in[6 + i].v = tmp[3].v + tmp[1].v; - in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); - in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v; - }) - - // Compute the output tile - TILE(DATA_TYPE, 16, N0, out); - - LOOP_UNROLLING(int, i, 0, 1, 4, - { - tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v; - tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v; - tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v; - tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v; - tmp[3].v = tmp[3].v 
+ tmp[3].v; - out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v; - out[4 * i + 1].v = tmp[3].v + tmp[1].v; - out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); - out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v; - }) - -#if defined(HAS_BIAS) - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); -#endif // HAS_BIAS - - int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; - - T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 16, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, yk, 0, 1, 4, - { - LOOP_UNROLLING(int, xk, 0, 1, 4, - { - int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); - int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); - dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH; - dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) - }) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -} - #define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \ ({ \ comm_fact.s0 = d1 + d2; \ @@ -1023,214 +654,6 @@ __kernel void winograd_output_transform_4x4_5x5_nchw( 0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y)); #endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) } - -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC - * - * @note The number of tiles along the X direction must 
be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_4x4_5x5_nhwc( - TENSOR4D(src, BUFFER), - TENSOR4D(dst, BUFFER), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM - const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES - const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX - -#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - TILE(DATA_TYPE, 8, N0, in); - TILE(DATA_TYPE, 4, N0, out); - TILE(DATA_TYPE, 4, N0, tmp); - TILE(uint, 8, 1, src_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 8, - { - src_indirect_y[i].v = mout + i * SRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 8, - { - in[i].v = 0; - }) - - // "in" contains 1x8 or 8x1 tile here - T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - // A^T * in, and in this degenerate case out consists of 1 column/row - tmp[0].v = in[1].v - in[2].v; - tmp[1].v = 2.0f * (in[3].v - in[4].v); - tmp[2].v = 2.0f * (in[5].v + in[6].v); - tmp[3].v = in[3].v + in[4].v; - out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v; - out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v); - out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v; - out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v; - -#if defined(HAS_BIAS) - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); -#endif // HAS_BIAS - - int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; - - T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 4, 1, dst_indirect_y); - - // Calculate the destination indirect Y -#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, yk, 0, 1, 4, - { - int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); - dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH; - 
dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - LOOP_UNROLLING(int, xk, 0, 1, 4, - { - int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); - dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH; - dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); - -#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) - // Calculate the indirect Y for the source tensor - TILE(DATA_TYPE, 64, N0, in); - TILE(DATA_TYPE, 6, N0, tmp); - TILE(uint, 64, 1, src_indirect_y); - - LOOP_UNROLLING(int, i, 0, 1, 64, - { - src_indirect_y[i].v = mout + i * SRC_HEIGHT; - src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64); - }) - - // Initialize the input tile - LOOP_UNROLLING(int, i, 0, 1, 64, - { - in[i].v = 0; - }) - - // "in" here is 8x8 tile - T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); - - // A^T * in - LOOP_UNROLLING(int, i, 0, 1, 8, - { - tmp[0].v = in[8 + i].v + in[16 + i].v; - tmp[1].v = in[8 + i].v - in[16 + i].v; - tmp[2].v = in[24 + i].v + in[32 + i].v; - tmp[3].v = in[24 + i].v - in[32 + i].v; - tmp[3].v = tmp[3].v + tmp[3].v; - tmp[4].v = in[40 + i].v + in[48 + i].v; - tmp[4].v = tmp[4].v + tmp[4].v; - tmp[5].v = in[40 + i].v - in[48 + i].v; - - // 4x8 matrix as a result - in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); - in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); - in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v); - in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v; - }) - - 
// Compute the output tile - TILE(DATA_TYPE, 16, N0, out); - - // in * A, with in = A^T * in as above - LOOP_UNROLLING(int, i, 0, 1, 4, - { - tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v; - tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v; - tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v; - tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v; - tmp[3].v = tmp[3].v + tmp[3].v; - tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v; - tmp[4].v = tmp[4].v + tmp[4].v; - tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v; - - // 4x4 tile - out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); - out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); - out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v; - out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v; - }) - -#if defined(HAS_BIAS) - TILE(DATA_TYPE, 1, N0, b); - - T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); - - // c = c + bias[broadcasted] - T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); -#endif // HAS_BIAS - - int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; - int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; - - T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); - - TILE(uint, 16, 1, dst_indirect_y); - - // Calculate the destination indirect Y - LOOP_UNROLLING(int, yk, 0, 1, 4, - { - LOOP_UNROLLING(int, xk, 0, 1, 4, - { - int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); - int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); - dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH; - dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); - }) - }) - - // Store the tile in reverse order so the invalid values are overwritten with the valid ones - T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); -#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || 
defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) -} #endif // defined(VEC_SIZE) && VEC_SIZE == 4 #if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) @@ -1303,73 +726,6 @@ __kernel void winograd_output_transform_2x1_3x1_nchw( ); } -/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_2x1_7x1_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - winograd_output_transform_2x2_7x7_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes, -#if defined(HAS_BIAS) - bias_ptr, - bias_stride_x, - bias_step_x, - bias_offset_first_element_in_bytes, -#endif // defined(HAS_BIAS) - dst_size); -} #endif // defined(VEC_SIZE) && VEC_SIZE == 2 #if defined(VEC_SIZE) && VEC_SIZE == 4 @@ -1509,141 +865,6 @@ __kernel void winograd_output_transform_4x1_5x1_nchw( ); } -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_4x1_3x1_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - winograd_output_transform_4x4_3x3_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes, -#if defined(HAS_BIAS) - bias_ptr, - bias_stride_x, - bias_step_x, - bias_offset_first_element_in_bytes, -#endif // defined(HAS_BIAS) - dst_size); -} - -/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_4x1_5x1_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - winograd_output_transform_4x4_5x5_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes, -#if defined(HAS_BIAS) - bias_ptr, - bias_stride_x, - bias_step_x, - bias_offset_first_element_in_bytes, -#endif // defined(HAS_BIAS) - dst_size); -} #endif // defined(VEC_SIZE) && VEC_SIZE == 4 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) @@ -1717,73 +938,6 @@ __kernel void winograd_output_transform_1x2_1x3_nchw( ); } -/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_1x2_1x7_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - winograd_output_transform_2x2_7x7_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes, -#if defined(HAS_BIAS) - bias_ptr, - bias_stride_x, - bias_step_x, - bias_offset_first_element_in_bytes, -#endif // defined(HAS_BIAS) - dst_size); -} #endif // defined(VEC_SIZE) && VEC_SIZE == 2 #if defined(VEC_SIZE) && VEC_SIZE == 4 @@ -1923,141 +1077,6 @@ __kernel void winograd_output_transform_1x4_1x5_nchw( ); } -/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_1x4_1x3_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - winograd_output_transform_4x4_3x3_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes, -#if defined(HAS_BIAS) - bias_ptr, - bias_stride_x, - bias_step_x, - bias_offset_first_element_in_bytes, -#endif // defined(HAS_BIAS) - dst_size); -} - -/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC - * - * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 - * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 - * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 - * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 - * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 - * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_output_transform_1x4_1x5_nhwc( - TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst), -#if defined(HAS_BIAS) - VECTOR_DECLARATION(bias), -#endif // 
defined(HAS_BIAS) - int dst_size) -{ - winograd_output_transform_4x4_5x5_nhwc(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_stride_w, - dst_step_w, - dst_offset_first_element_in_bytes, -#if defined(HAS_BIAS) - bias_ptr, - bias_stride_x, - bias_step_x, - bias_offset_first_element_in_bytes, -#endif // defined(HAS_BIAS) - dst_size); -} #endif // defined(VEC_SIZE) && VEC_SIZE == 4 #endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) #endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl index 1217a37345..a5334525fe 100644 --- a/src/core/CL/cl_kernels/space_to_depth.cl +++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2019-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,12 +23,12 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) -/** Space to depth transformation. (NCHW) +#if defined(DATA_TYPE) && defined(BATCH_SIZE) +/** Batch to space transformation. (NHWC) * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 * * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) @@ -39,6 +39,12 @@ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor * @param[in] batch_id The input tensor batch id + * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32 + * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) @@ -48,29 +54,39 @@ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void space_to_depth_nchw( - TENSOR4D_DECLARATION(input), +__kernel void batch_to_space_nhwc( + TENSOR3D_DECLARATION(input), const int batch_id, - TENSOR3D_DECLARATION(output)) + VECTOR_DECLARATION(block_shape), + TENSOR4D_DECLARATION(output)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); - const int x = get_global_id(0); - const int y = get_global_id(1); - const int z = get_global_id(2) % r; + const int block_x = *((__global int *)vector_offset(&block, 0)); + const int block_y = *((__global int *)vector_offset(&block, 1)); - const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE; - const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE; + const int r = (BATCH_SIZE / (block_x * block_y)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0); + const int w = batch_id % r; - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id)); + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; + + *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); } -/** 
Space to depth transformation. (NHWC) +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) + +#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) +/** Batch to space transformation. (NHWC) * * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 - * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 + * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 * * @param[in] input_ptr Pointer to the source tensor. Supported data types: All * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) @@ -90,22 +106,26 @@ __kernel void space_to_depth_nchw( * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor */ -__kernel void space_to_depth_nhwc( - TENSOR4D_DECLARATION(input), +__kernel void batch_to_space_static_nhwc( + TENSOR3D_DECLARATION(input), const int batch_id, - TENSOR3D_DECLARATION(output)) + TENSOR4D_DECLARATION(output)) { - Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + const int block_x = BLOCK_SHAPE_X; + const int block_y = BLOCK_SHAPE_Y; - const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int r = (BATCH_SIZE / (block_x * block_y)); const int x = get_global_id(1); const int 
y = get_global_id(2); - const int z = get_global_id(0) % r; + const int z = get_global_id(0); + const int w = batch_id % r; - const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; - const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; + const int out_x = x * block_x + (batch_id / r) % block_x; + const int out_y = y * block_y + (batch_id / r) / block_x; - *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id)); + *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr); } -#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file +#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl new file mode 100644 index 0000000000..cb2da1bd99 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl @@ -0,0 +1,146 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#define ADD_OP(a, b) ((a) + (b)) +#define SUB_OP(a, b) ((a) - (b)) +#define MUL_OP(a, b) ((a) * (b)) +#define INVSQRT_OP(a) rsqrt((a)) +#define SQCVT_SAT(a) (a) + +#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE) +#include "activation_float_helpers.h" + +/** Apply batch normalization on tensors with NHWC format. + * + * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. 
-DACTIVATION_TYPE=relu + * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively + * + * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 + * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p input_ptr + * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) + * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor + * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr + * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes) + * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor + * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr + * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes) + * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor + * @param[in] gamma_ptr Pointer to the gamma source tensor. 
Supported data types: same as @p input_ptr + * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes) + * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor + * @param[in] epsilon Epsilon parameter in the batch normalization equation + */ +__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input), +#ifndef IN_PLACE + TENSOR3D_DECLARATION(output), +#endif /* not IN_PLACE */ + VECTOR_DECLARATION(mean), + VECTOR_DECLARATION(var), +#ifndef USE_DEFAULT_BETA + VECTOR_DECLARATION(beta), +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + VECTOR_DECLARATION(gamma), +#endif /* USE_DEFAULT_GAMMA */ + float epsilon) +{ + uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0); + + __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z; +#ifdef IN_PLACE + __global uchar *output_addr = input_ptr; +#else /* IN_PLACE */ + __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z; +#endif /* IN_PLACE */ + __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs; + __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs; +#ifndef USE_DEFAULT_BETA + __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs; +#endif /* USE_DEFAULT_BETA */ +#ifndef USE_DEFAULT_GAMMA + __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs; +#endif /* USE_DEFAULT_GAMMA */ + + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + data = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + denominator = 0; + 
VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + numerator = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + x_bar = 0; + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res0 = 0; + + data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr); + denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr); + denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon)))); + + // Calculate x bar and store results + numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr); + numerator = SUB_OP(data, numerator); + x_bar = MUL_OP(numerator, denominator); + +#ifndef USE_DEFAULT_GAMMA + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr); + + res0 = MUL_OP(gamma_vec, x_bar); +#else /* USE_DEFAULT_GAMMA */ + // gamma is equal to 1, no need to perform multiplications + res0 = x_bar; +#endif /* USE_DEFAULT_GAMMA */ + +#ifndef USE_DEFAULT_BETA + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr); + // beta is not zero, hence we need to perform the addition + res0 = ADD_OP(res0, beta_vec); +#endif /* USE_DEFAULT_BETA */ + + res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL); + + STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0) +} +#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl index 63af2c6137..233beb3aa9 100644 --- a/src/core/CL/cl_kernels/channel_shuffle.cl +++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl @@ -38,68 +38,6 @@ mod_res = (x)-r; \ }) -/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details. - * - * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4 - * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64 - * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2 - * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1 - * K is equal to num_channels / num_groups. - * - * @param[in] src_ptr Pointer to the source matrix. Supported data types: All - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src), - TENSOR4D_DECLARATION(dst)) -{ - uint curr_channel = 0; // channel id of input - uint batch_id = 0; // batch id - uint group_id = 0; // group id - uint channel_id = 0; // channel id within the group - - // Compute curr_channel and batch_id - DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel); - - // Compute group_id and channel_id - DIV_MOD_UINT(curr_channel, K, group_id, channel_id); - - const uint x = get_global_id(0) * VEC_SIZE; - const uint y = get_global_id(1) * 2; - const uint z = channel_id * NUM_GROUPS + group_id; - - // Load the Nx2 block - const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w; - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y)); - - // Store blocks - __global 
uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w; - VSTORE(VEC_SIZE) - (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y)); - VSTORE(VEC_SIZE) - (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y)); -} - #if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X) /** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details. @@ -219,4 +157,4 @@ __kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src), STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } #endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X) -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl new file mode 100644 index 0000000000..5464a4bef8 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Depth to space transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All. 
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void depth_to_space_nhwc( + TENSOR3D_DECLARATION(input), + const int batch_id, + TENSOR4D_DECLARATION(output)) +{ + Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0) % r; + + const int out_x = x * BLOCK_SHAPE + (get_global_id(0) 
/ r) % BLOCK_SHAPE; + const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl new file mode 100644 index 0000000000..238d3a7921 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) +/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC) + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. 
-DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale Pointer to buffer with the per channel quantized scales + */ +__kernel void dequantization_layer_per_channel_nhwc( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + __global float *scale) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + 
Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + scale -= max(xi - (int)LAST_ACCESSED_X, 0); + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = VLOAD(VEC_SIZE)(0, &scale[xi]); + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl index 75a7a0f004..75a7a0f004 100644 --- a/src/core/CL/cl_kernels/direct_convolution.cl +++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl index 1f940001f3..d2e7e45ada 100644 --- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl @@ -125,7 +125,7 @@ __kernel void dwc_native_fp_nhwc( const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX #else // defined(BATCHED_EXECUTION) const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT - const int bout = 0; // BATCH SIZE IDX + const int bout = 0; // BATCH SIZE IDX #endif // defined(BATCHED_EXECUTION) int xi = xo * STRIDE_X; @@ -148,37 +148,37 @@ __kernel void dwc_native_fp_nhwc( #if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, -#else // _IWEI_HEIGHT <= 5 +#else // _IWEI_HEIGHT <= 5 for(int yk = 0; yk < _IWEI_HEIGHT; yk++) #endif // _IWEI_HEIGHT <= 5 - { - TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); + { + TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); - LOOP_UNROLLING(int, i, 0, 1, _IM0_A, - { - a[i].v = 0; - }) + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = 0; + }) - // Load tile from the src tensor (TILE A) - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); + // Load tile from the src tensor (TILE A) + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); - TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); + TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); - // Load tile from the weights tensor (TILE B) - T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, 
WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b); + // Load tile from the weights tensor (TILE B) + T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b); - // Optimized path for STRIDE_X == 1 - // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension - LOOP_UNROLLING(int, m0, 0, 1, M0, + // Optimized path for STRIDE_X == 1 + // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, { - LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, - { - c[m0].v += a[xk + m0].v * b[xk].v; - }) + c[m0].v += a[xk + m0].v *b[xk].v; }) - } + }) + } #if _IWEI_HEIGHT <= 5 - ) + ) #endif // _IWEI_HEIGHT <= 5 #if defined(HAS_BIAS) diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl index aa6ba4de39..1bc58b6e26 100644 --- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl +++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl @@ -179,61 +179,61 @@ __kernel void dwc_native_quantized_nhwc( #if _IWEI_HEIGHT <= 5 LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT, -#else // _IWEI_HEIGHT <= 5 +#else // _IWEI_HEIGHT <= 5 for(int yk = 0; yk < _IWEI_HEIGHT; yk++) #endif // _IWEI_HEIGHT <= 5 - { - TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); + { + TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a); - LOOP_UNROLLING(int, i, 0, 1, _IM0_A, - { - a[i].v = ZERO_VALUE; - }) + LOOP_UNROLLING(int, i, 0, 1, _IM0_A, + { + a[i].v = ZERO_VALUE; + }) - // Load tile from the src tensor (TILE A) - T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); + // Load tile from the src tensor (TILE A) + T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, 
SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a); - TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); + TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b); - // Load tile from the weights tensor (TILE B) - T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b); + // Load tile from the weights tensor (TILE B) + T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b); - // Optimized path for STRIDE_X == 1 - // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension - LOOP_UNROLLING(int, m0, 0, 1, M0, + // Optimized path for STRIDE_X == 1 + // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension + LOOP_UNROLLING(int, m0, 0, 1, M0, + { + LOOP_UNROLLING(int, n0, 0, 1, N0, { - LOOP_UNROLLING(int, n0, 0, 1, N0, - { #if _IWEI_WIDTH <= 16 #define DOT_DATA_TYPE SRC_DATA_TYPE #define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE)) - // Optimized path for the dot instruction - TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0); - TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0); - ACC_DATA_TYPE offset_a = 0; - ACC_DATA_TYPE offset_b = 0; + // Optimized path for the dot instruction + TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0); + TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0); + ACC_DATA_TYPE offset_a = 0; + ACC_DATA_TYPE offset_b = 0; - LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, - { - x0[0].s[xk] = a[xk + m0].s[n0]; - y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION; - }) - DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]); - REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a); - REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b); - c[m0].s[n0] += 
offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET; + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + x0[0].s[xk] = a[xk + m0].s[n0]; + y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION; + }) + DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]); + REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a); + REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b); + c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET; #else // _IWEI_WIDTH <= 16 - LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, - { - c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET)); - }) -#endif // _IWEI_WIDTH <= 16 + LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH, + { + c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET)); }) +#endif // _IWEI_WIDTH <= 16 }) - } + }) + } #if _IWEI_HEIGHT <= 5 - ) + ) #endif // _IWEI_HEIGHT <= 5 #if _IWEI_WIDTH <= 16 diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl new file mode 100644 index 0000000000..ac00c11283 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/im2col.cl @@ -0,0 +1,532 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#if defined(DATA_TYPE) && defined(ELEMENT_SIZE) + +#if ELEMENT_SIZE == 1 +#define COND_DATA_TYPE char +#elif ELEMENT_SIZE == 2 +#define COND_DATA_TYPE short +#elif ELEMENT_SIZE == 4 +#define COND_DATA_TYPE int +#else // ELEMENT_SIZE +#error "Element size not support" +#endif // ELEMENT_SIZE + +#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) + +#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE) +#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE) + +/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension + * @name IM2COL1X9_NHWC_STORE + * + * @note To use this macro for a 3x3 block, @p ROW has to be 0 + * + * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16 + * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size + * @param[in] DATA_TYPE Data type of @p DATA + * @param[in] SRC_DEPTH Input channel size / depth + * @param[in] DATA Value variable base name + * @param[in] ROW The row number to store. 
Supported: 0-8 + * @param[in] OUTPUT_PTR Output pointer + * @{ + */ +#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE +#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + const bool at_channel_boundary = get_global_id(0) == 0; \ + if(at_channel_boundary) \ + { \ + IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + } \ + else \ + { \ + IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + } +#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE +#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) +#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE + +#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + VSTORE(VECTOR_SIZE) \ + (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ + VSTORE(VECTOR_SIZE) \ + (DATA##8, 
0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); + +#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \ + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \ + (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH); +/** @}*/ + +/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). 
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col3x3_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int3 offset = 0; + + // Clamp xi + int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT); +#if PAD_LEFT != 0 || PAD_RIGHT != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1)); +#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 + // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor + xi_offset *= (int3)src_stride_y; + + // Out-of-bound condition for X + int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH); + + // yi == 0 + // Clamp yi + // yi_coord is casted to unsigned int in order to use just a min() operation + // A "-1" 32 bit signed variable converted to unsigned gives 4294967295 + // This is a trick so that the values loaded in the padding 
areas are always from the last row (SRC_HEIGHT - 1), + // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around + // also causes y_cond (y padding condition) to be satisfied + yi_coord = yi - (int)PAD_TOP; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT)); + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 1 + // Clamp yi_coord (it can be negative if PAD_TOP > 1) + yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 
offset.s1)); + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with zeros + y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT)); + values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // yi == 2 + // Clamp yi_coord + yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y; + + // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0 +#if PAD_TOP != 0 || PAD_BOTTOM != 0 + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); +#endif // PAD_TOP != 0 || PAD_BOTTOM != 0 + + // Compute offset + offset = xi_offset + (yi_coord * (int)src_stride_z); + + // Load input values + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0)); + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1)); + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2)); + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + // Replace invalid values with PAD_VALUE + y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT)); + values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0))); + values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1))); + values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2))); +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + + // Store in a boundary-aware way to avoid padding + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, 
BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr) + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. + // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f; + } +#endif // HAS_BIAS +} + +#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 +#define IM2COL1x9(i) \ + ({ \ + yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ + \ + offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ + offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ + \ + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ + VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ + \ + int y_cond = (int)((uint)(yi - 
(int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \ + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \ + values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \ + values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \ + values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \ + values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \ + values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \ + values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \ + values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \ + values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \ + \ + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ + }) +#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 +#define IM2COL1x9(i) \ + ({ \ + yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \ + yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \ + \ + offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \ + offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \ + \ + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \ + VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \ + VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \ + VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \ + VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \ + VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \ + 
VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \ + VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \ + VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \ + \ + IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \ + }) +#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0 + +/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3 + * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). 
+ */ +__kernel void im2col9x9_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int yi_coord = 0; + int8 offset0 = 0; + int offset1 = 0; + + // Clamp xi + int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT); + int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT); + +#if PAD_LEFT != 0 || PAD_RIGHT != 0 +#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val) + xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1)); + xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1)); +#endif // PAD_LEFT != 0 || PAD_RIGHT != 0 + xi_offset0 *= (int8)src_stride_y; + xi_offset1 *= (int)src_stride_y; + + // Out-of-bound condition for X + int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH); + int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + IM2COL1x9(0); + IM2COL1x9(1); + IM2COL1x9(2); 
+ IM2COL1x9(3); + IM2COL1x9(4); + IM2COL1x9(5); + IM2COL1x9(6); + IM2COL1x9(7); + IM2COL1x9(8); + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. + // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f; + } +#endif // HAS_BIAS +} + +/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC + * + * @note This kernel computes VECTOR_SIZE elements + * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements + * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2 + * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1 + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128 + * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34 + * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. 
-DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64 + * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2 + * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0 + * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1 + * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1 + * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row. + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes). + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes). + */ +__kernel void im2col_generic_nhwc( + TENSOR3D_DECLARATION(src), + IMAGE_DECLARATION(dst), + uint src_stride_w, + uint dst_stride_w) +{ + // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding + const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE; + const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0); + const int yo = get_global_id(1); + const int batch = get_global_id(2); // batch size + + // Calculate input indices + const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X; + const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y; + + // Get input and output address + __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w; + __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w; + + int i = 0; + for(int yk = 0; yk < KERNEL_HEIGHT; ++yk) + { + // Clamp yi_coord + int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP; + yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1)); + + // Out-of-bound condition for Y + int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * 
DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT); + + for(int xk = 0; xk < KERNEL_WIDTH; ++xk) + { + // Clamp xi_coord + int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT); + xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1)); + + // Out-of-bound condition for X + int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH); + + int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z); + + VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset)); + +#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + // Replace with PAD_VALUE if the value is out-of-bound + values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition))); +#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0 + + // Store in a boundary-aware way to avoid padding +#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE + const bool at_channel_boundary = get_global_id(0) == 0; + if(at_channel_boundary) + { + VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + } + else // at_channel_boundary +#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE + { + VSTORE(VECTOR_SIZE) + (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH); + } + i++; + } + } + +#ifdef HAS_BIAS + // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is + // added at the end of the channel, while the boundary vec is at the beginning of the channel. 
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in + // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE + // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp + if((ch + VECTOR_SIZE) >= SRC_DEPTH) + { + *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f; + } +#endif // HAS_BIAS +} +#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) +#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl index 4569208824..7e35e161c8 100644 --- a/src/core/CL/cl_kernels/normalization_layer.cl +++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl @@ -30,69 +30,6 @@ #define POW_OP(x, y) pow((x), (y)) #define SQCVT_SAT(a) (a) -#if defined(NUM_SLICES) -/** Apply cross-map normalization. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 - * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 - * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192 - * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA - * - * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA); - - const int current_slice = get_global_id(2); - const int left_slice = max(-(int)RADIUS, -current_slice); - const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice); - - for(int i = left_slice; i <= right_slice; i++) - { - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i)); - acc = ADD_OP(acc, MUL_OP(values, values)); - } - - acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized = POW_OP(acc, beta_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = 
DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); - - VSTORE(VEC_SIZE) - (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); -} -#endif /* defined(NUM_SLICES) */ - #if defined(WIDTH_SIZE) /** Apply cross-map normalization. * @@ -156,85 +93,6 @@ __kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input), STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } - -/** Apply in-map normalization when tensors are in the NCHW data layout format. - * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16 - * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5 - * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA - * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1 - * - * @param[in] input_ptr Pointer to the first source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - acc = 0; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - coeff_v = SQCVT_SAT(COEFF); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - beta_v = SQCVT_SAT(BETA); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - 
kappa_v = SQCVT_SAT(KAPPA); - - const int current_col = get_global_id(0) << 2; - const int left_pos = max(-(int)RADIUS, -3 - current_col); - const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col); - -#if defined(IN_MAP_2D) - const int current_row = get_global_id(1); - const int first_row = max(-(int)RADIUS, -current_row); - const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row); -#endif /* defined(IN_MAP_2D) */ - -#if defined(IN_MAP_2D) - for(int j = first_row; j <= last_row; ++j) - { -#endif /* defined(IN_MAP_2D) */ - for(int i = left_pos; i <= right_pos; ++i) - { -#if defined(IN_MAP_2D) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0)); -#else /* defined(IN_MAP_2D) */ - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0)); -#endif /* defined(IN_MAP_2D) */ - acc = ADD_OP(acc, MUL_OP(values, values)); - } -#if defined(IN_MAP_2D) - } -#endif /* defined(IN_MAP_2D) */ - - acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized = POW_OP(acc, beta_v); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized); - - VSTORE(VEC_SIZE) - (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr); -} #endif // defined(WIDTH_SIZE) #if defined(NUM_SLICES) && defined(DIM1_SIZE) @@ -267,9 +125,9 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), TENSOR3D_DECLARATION(output)) { // Offset computation - const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); - const int current_cols = get_global_id(1); - const int current_rows = get_global_id(2); + const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + const int current_cols = get_global_id(1); + const int current_rows = get_global_id(2); // Address computation __global uchar *input_addr = 
input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE); @@ -284,8 +142,8 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) kappa_v = SQCVT_SAT(KAPPA); - const int first_col = max(0, current_cols - (int)RADIUS); - const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS); + const int first_col = max(0, current_cols - (int)RADIUS); + const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS); #if defined(IN_MAP_2D) const int first_row = max(0, current_rows - (int)RADIUS); @@ -312,7 +170,7 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input), const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) normalized = POW_OP(acc, beta_v); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows * output_stride_z)), normalized); + normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows *output_stride_z)), normalized); STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl index 0a098356b4..86c33499e2 100644 --- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl +++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl @@ -27,59 +27,6 @@ #define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) -/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. 
-DVEC_SIZE=8 - * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 - * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32 - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. 
Supported data types: same as @p src_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr - * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) - * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor - */ -__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(std)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector std = CONVERT_TO_VECTOR_STRUCT(std); - - const uint current_slice = get_global_id(2) % NUM_CHANNELS; - - const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))); - const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))); - - TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); - TYPE res = (data - curr_mean) / curr_std; - - VSTORE(VEC_SIZE) - (res, 0, (__global DATA_TYPE *)dst.ptr); -} - /** Apply normalize_planar_yuv layer on tensors with NHWC data layout. * * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float @@ -131,4 +78,4 @@ __kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src), STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl index d660fffb58..7bc3c15a63 100644 --- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl @@ -29,76 +29,6 @@ #define OFFSET_FLT ((float)OFFSET) #define SCALE_FLT ((float)SCALE) -#if defined(NUM_CHANNELS) - -/** Apply normalize_planar_yuv layer on tensors with NCHW data layout. - * - * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float - * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8 - * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8 - * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8 - * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8 - * - * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes) - * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes) - * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes) - * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr - * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes) - * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor - * @param[in] std_ptr Pointer to the std tensor. 
Supported data types: same as @p src_ptr - * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes) - * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor - */ -__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst), - VECTOR_DECLARATION(mean), - VECTOR_DECLARATION(std)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - Vector mean = CONVERT_TO_VECTOR_STRUCT(mean); - Vector std = CONVERT_TO_VECTOR_STRUCT(std); - - const uint current_slice = get_global_id(2) % NUM_CHANNELS; - - VEC_DATA_TYPE(float, VEC_SIZE) - curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)))); - curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT; - - VEC_DATA_TYPE(float, VEC_SIZE) - curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)))); - curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT; - - VEC_DATA_TYPE(float, VEC_SIZE) - data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE)); - data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT; - - // Perform normalization - VEC_DATA_TYPE(float, VEC_SIZE) - res_flt = (data_flt - curr_mean_flt) / curr_std_flt; - - const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); - VSTORE(VEC_SIZE) - (res_u8, 0, (__global DATA_TYPE *)dst.ptr); -} - -#endif // defined(NUM_CHANNELS) - /** Apply normalize_planar_yuv layer on tensors with NHWC data layout. * * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float @@ -163,4 +93,4 @@ __kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src), const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE); STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0); } -#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) +#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl new file mode 100644 index 0000000000..5b59ff5088 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl @@ -0,0 +1,364 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "repeat.h" +#include "tile_helpers.h" + +#if defined(POOL_AVG) || defined(POOL_L2) +#define POOL_OP(x, y) ((x) + (y)) +#else /* defined(POOL_AVG) || defined(POOL_L2) */ +#define POOL_OP(x, y) (fmax((x), (y))) +#endif /* defined(POOL_AVG) || defined(POOL_L2) */ + +#if defined(POOL_L2) +#define POW2_OP(x, vec_size) ((x) * (x)) +#else /* defined(POOL_L2) */ +#define POW2_OP(x, vec_size) (x) +#endif /* defined(POOL_L2) */ + +#define DIV_OP(x, y) (x * (1.f / y)) +#define SQRT_OP(x) sqrt((x)) + +#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) + +#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) +/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * -# l2 normalisation, -DPOOL_L2 must be passed at compile time + * + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. 
-DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void pooling_layer_MxN_nhwc( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output)) +{ + // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 + // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side + int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); + int idx_out_w = GET_SPATIAL_IDX(1, 1, 0); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT; + int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT; +#else //DST_BATCH_SIZE != 1 + int idx_out_h = GET_SPATIAL_IDX(2, 1, 0); + int idx_out_n = 0; +#endif // DST_BATCH_SIZE != 1 + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * + output_stride_w; + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = INITIAL_VALUE; + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); + +#if defined(EXCLUDE_PADDING) + int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); +#else // defined(EXCLUDE_PADDING) + int filter_size = POOL_SIZE_X * POOL_SIZE_Y; +#endif // defined(EXCLUDE_PADDING) + +#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + // Global pooling path + for(int y = 0; y < POOL_SIZE_Y; ++y) + { +#pragma unroll 8 + for(int x = 0; x < POOL_SIZE_X; ++x) + { +#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + for(int y = pool_y_s; y < pool_y_e; ++y) + { +#pragma unroll 8 + for(int x = pool_x_s; x < pool_x_e; ++x) + { +#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 + 
VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + data0; +#if defined(FP_MIXED_PRECISION) + // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE + data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); +#endif // defined(FP_MIXED_PRECISION) + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; +#endif // defined(POOL_L2) + res0 = POOL_OP(res0, data0); + } + } + +#if defined(POOL_AVG) || defined(POOL_L2) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#endif // defined(POOL_AVG) || defined(POOL_L2) + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) + + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) +} +#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) + +#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + +/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types: + * -# max, -DPOOL_MAX must be passed at compile time + * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time + * -# average, -DPOOL_AVG must be passed at compile time. 
If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time + * -# l2 normalisation, -DPOOL_L2 must be passed at compile time + * + * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 + * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float + * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result + * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT + * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE + * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions + * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y + * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE + * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] indices_ptr (Optional) Pointer to the indices tensor. 
Supported data types: U32 + * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes) + * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes) + * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes) + * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes) + * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor + */ +__kernel void pooling_layer_2x2_nhwc( + TENSOR4D_DECLARATION(input), + TENSOR4D_DECLARATION(output) +#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) + , + TENSOR4D_DECLARATION(indices) +#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) +) +{ + // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 + // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side + int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); + int idx_out_w = get_global_id(1); +#if DST_BATCH_SIZE != 1 + // If batch size != 1, the batch size dimension is collapsed over the height dimension + int idx_out_h = get_global_id(2) % DST_HEIGHT; + int idx_out_n = get_global_id(2) / DST_HEIGHT; +#else //SRC_BATCH_SIZE != 1 + int idx_out_h = get_global_id(2); + int idx_out_n = 0; +#endif // SRC_BATCH_SIZE != 1 + + int idx_in_w = idx_out_w * STRIDE_X - PAD_X; + int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; + + __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; + + __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * + output_stride_w; + + int pool_x_s = max((int)0, -idx_in_w); + int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w); + int pool_y_s = max((int)0, -idx_in_h); + int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h); + + int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s); + + int x0 = pool_x_s + idx_in_w; + int y0 = pool_y_s + idx_in_h; + int x1 = pool_x_e - 1 + idx_in_w; + int y1 = pool_y_e - 1 + idx_in_h; + + REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0); + +#if defined(FP_MIXED_PRECISION) + // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE + data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * 
input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); + data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); +#else // defined(FP_MIXED_PRECISION) + data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)); + data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)); + data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)); + data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)); +#endif // defined(FP_MIXED_PRECISION) + +#if !defined(POOL_MAX) + if(filter_size != 4) + { + SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0; + SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1); + SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0; + SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1); + + // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) + data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s)); + data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s)); + data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e)); + data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e)); + } +#endif // !defined(POOL_MAX) + +#if defined(POOL_L2) + // Raise to power of 2 for L2 Pooling + data0 *= data0; + data1 *= data1; + data2 *= data2; + data3 *= data3; +#endif /* defined(POOL_L2) */ + + VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) + res0 = data0; + res0 = POOL_OP(res0, data1); + res0 = POOL_OP(res0, data2); + res0 = POOL_OP(res0, data3); + +#if 
defined(POOL_AVG) || defined(POOL_L2) +#if defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; +#else // !defined(EXCLUDE_PADDING) + res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; +#endif // defined(EXCLUDE_PADDING) +#endif // defined(POOL_AVG) || defined(POOL_L2) + +#if defined(POOL_L2) + // Take square root of the result in L2 pooling + res0 = SQRT_OP(res0); +#endif // defined(POOL_L2) + + // Store result +#if defined(FP_MIXED_PRECISION) + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); + STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#else // defined(FP_MIXED_PRECISION) + STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); +#endif // defined(FP_MIXED_PRECISION) + +#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) + + // This part is used to return the index of the maximum value + // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor + + // note: Batch dimension does not contribute in the offset contribution + VEC_DATA_TYPE(uint, VEC_SIZE) + base_index = (uint)idx_out_c; + + base_index += VEC_OFFS(uint, VEC_SIZE); + + VEC_DATA_TYPE(uint, VEC_SIZE) + index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) + index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) + index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); + VEC_DATA_TYPE(uint, VEC_SIZE) + index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); + + index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE))); + index1 = select(index3, index2, CONVERT(isgreaterequal(data2, 
data3), VEC_DATA_TYPE(int, VEC_SIZE))); + index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE))); + + __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n * + indices_stride_w; + + // Store result + STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); +#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) +} +#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl index d8cef2b4e6..46268a4a88 100644 --- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017-2020 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -51,108 +51,6 @@ #error "L2 pooling is not supported" #endif /* defined(POOL_L2) */ -int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(0) * stride_x - pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max(0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to N (NCHW) - * - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. 
-DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED - * @param[in] input_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void pooling_layer_MxN_quantized_nchw( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - int8 vdata = INITIAL_VALUE; - int sdata = INITIAL_VALUE; - - // Load data - for(int y = 0; y < POOL_SIZE_Y; y++) - { - int x = 0; - for(; x <= 
((int)POOL_SIZE_X - 8); x += 8) - { - VEC_TYPE(8) - data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); - int8 data0 = convert_int8(data); - vdata = POOL_OP(vdata, data0); - } - - // Leftover - for(; x < (int)POOL_SIZE_X; ++x) - { - DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); - int data0 = convert_int(data); - sdata = POOL_OP(sdata, data0); - } - } - - // Reduce result - int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567); - int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23); - int res = POOL_OP(reduce2.s0, reduce2.s1); - res = POOL_OP(res, sdata); - -#if defined(POOL_AVG) - res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y))); -#endif /* defined(POOL_AVG) */ - - DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE); - -#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) - - const float result_f32 = convert_float(result_q8); - const float input_offset = (float)OFFSET_IN1; - const float input_scale = (float)SCALE_IN1; - const float scale_out = (float)SCALE_OUT; - const float offset_out = (float)OFFSET_OUT; - const float in_f32 = (result_f32 - input_offset) * input_scale; - const float out_f32 = in_f32 / scale_out + offset_out; - result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE); - -#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ - - *(__global DATA_TYPE *)output.ptr = result_q8; -} - #if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) /** Performs pooling layer of size equal to MxN. 
This OpenCL kernel can perform the following pooling types: * -# max, -DPOOL_MAX must be passed at compile time diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/nhwc/remap.cl index cb67c2df1e..0b629fe6c9 100644 --- a/src/core/CL/cl_kernels/remap.cl +++ b/src/core/CL/cl_kernels/nhwc/remap.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2017, 2021 Arm Limited. + * Copyright (c) 2017-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,113 +24,7 @@ #include "helpers.h" #include "warp_helpers.h" -#ifndef DEPTH_OUT -/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation. - * - * This kernel performs remapping with this method of pixel coordinate translation: - * out(x,y) = in(mapx(x,y), mapy(x,y)); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. 
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) - * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) - * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image - * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32. - * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) - * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) - * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image - * @param[in] width Width of the input image - * @param[in] height Height of the input image - */ -__kernel void remap_nearest_neighbour_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - IMAGE_DECLARATION(mapx), - IMAGE_DECLARATION(mapy), - const float width, - const float height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx); - Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy); - - float4 mapx_coords = vload4(0, (__global float *)mapx.ptr); - float4 mapy_coords = vload4(0, (__global float *)mapy.ptr); - float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1, - mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3); - - vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr); -} - -/** Performs a remapping of an input image to an output given two remapping 
image using bilinear as interpolation. - * - * This kernel performs remapping with this method of pixel coordinate translation: - * out(x,y) = in(mapx(x,y), mapy(x,y)); - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8. - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image - * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32. - * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes) - * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes) - * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image - * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32. 
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes) - * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes) - * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes) - * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes) - * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image - * @param[in] width Width of the input image - * @param[in] height Height of the input image - */ -__kernel void remap_bilinear_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - IMAGE_DECLARATION(mapx), - IMAGE_DECLARATION(mapy), - const float width, - const float height) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx); - Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy); - - float4 mapx_coords = vload4(0, (__global float *)mapx.ptr); - float4 mapy_coords = vload4(0, (__global float *)mapy.ptr); - float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1, - mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3); - - vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr); -} -#else // DEPTH_OUT +#ifdef DEPTH_OUT /** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation. * Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set. * @@ -283,4 +177,4 @@ __kernel void remap_bilinear_nhwc( *((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE); } -#endif // DEPTH_OUT +#endif // DEPTH_OUT
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl new file mode 100644 index 0000000000..a340b0b8a2 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl @@ -0,0 +1,76 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) + +#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \ + ({ \ + int offset = zo / (int)SRC_DEPTH; \ + xi = xo * (int)STRIDE + offset % (int)STRIDE; \ + yi = yo * (int)STRIDE + offset / (int)STRIDE; \ + zi = zo % SRC_DEPTH; \ + }) + +/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC + * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. 
-DDATA_TYPE=float + * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64 + * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: All + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void reorg_layer_nhwc( + TENSOR3D_DECLARATION(src), + TENSOR3D_DECLARATION(dst)) +{ + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst); + + int xo = get_global_id(1); + int yo = get_global_id(2); + int zo = get_global_id(0); + int xi, yi, zi; + + CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi); + + int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset)); +} +#endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl index d4c27e6cf6..1ea5e73df1 100644 --- a/src/core/CL/cl_kernels/scale.cl +++ b/src/core/CL/cl_kernels/nhwc/scale.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2016-2020 Arm Limited. + * Copyright (c) 2016-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,129 +24,6 @@ #include "helpers.h" #include "warp_helpers.h" -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. - */ -inline const float8 transform_nearest(const float2 coord, const float2 scale) -{ -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. 
- */ -inline const float8 transform_bilinear(const float2 coord, const float2 scale) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16. - * - * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. 
(Must be the same as the input) - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_nearest_neighbour_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - float8 transformed = transform_nearest(get_current_coords(), r); -#ifdef ALIGN_CORNERS - transformed = round(transformed); -#endif // ALIGN_CORNERS - const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE); - vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr); -} - -/** Performs an affine transformation on an image interpolating with the BILINEAR method. - * - * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * - * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16. 
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_bilinear_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - const float8 tc = transform_bilinear(get_current_coords(), r); - vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr); -} - #if defined(DEPTH_OUT) /** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. 
(NHWC) * @@ -262,22 +139,22 @@ __kernel void scale_bilinear_nhwc( const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); #ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x = (0.f <= new_xf && new_xf < input_width); const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); - const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y = (0.f <= new_yf && new_yf < input_height); const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); - const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y); + const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), + check_x && check_y); const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); + check_x1 && check_y); const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); + check_x && check_y1); const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y1); + check_x1 && check_y1); float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3); #else /* BORDER_MODE_REPLICATE */ float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), 
convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl index 010e4ed57a..de9bb607b0 100644 --- a/src/core/CL/cl_kernels/scale_quantized.cl +++ b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -24,67 +24,6 @@ #include "helpers_asymm.h" #include "warp_helpers_quantized.h" -/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates. - * - * @param[in] coord 2D coordinates to transform. - * @param[in] scale input/output scale ratio - * - * @return a float8 containing 4 2D transformed values in the input image. - */ -inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale) -{ - const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0); -#ifdef SAMPLING_POLICY_TOP_LEFT - const float4 new_x = in_x_coords * (float4)(scale.s0); - const float4 new_y = (float4)(coord.s1 * scale.s1); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#elif SAMPLING_POLICY_CENTER - const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f); - const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f); - return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3); -#else /* SAMPLING_POLICY */ -#error("Unsupported sampling policy"); -#endif /* SAMPLING_POLICY */ -} - -/** Performs an affine transformation on an image interpolating with the BILINEAR method. - * - * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT - * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. 
-DSCALE=0.5 - * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1 - * - * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8. - * @param[in] in_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input) - * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] input_width Input image width - * @param[in] input_height Input image height - * @param[in] scale_x The scale factor along x dimension - * @param[in] scale_y The scale factor along y dimension - */ -__kernel void scale_bilinear_quantized_nchw( - IMAGE_DECLARATION(in), - IMAGE_DECLARATION(out), - const float input_width, - const float input_height, - const float scale_x, - const float scale_y) -{ - Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in); - Image out = CONVERT_TO_IMAGE_STRUCT(out); - const float2 r = (float2)(scale_x, scale_y); - const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r); - vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, 
SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr); -} - #if defined(DEPTH_OUT) /** Performs scale on an image interpolating with the BILINEAR method. (NHWC) * @@ -146,22 +85,22 @@ __kernel void scale_bilinear_quantized_nhwc( const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1); #ifndef BORDER_MODE_REPLICATE - const bool check_x = (0.f <= new_xf && new_xf < input_width); - const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); - const bool check_y = (0.f <= new_yf && new_yf < input_height); - const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); - const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), - (get_global_id(2) / DEPTH_OUT)))), + const bool check_x = (0.f <= new_xf && new_xf < input_width); + const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1); + const bool check_y = (0.f <= new_yf && new_yf < input_height); + const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1); + const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), + (get_global_id(2) / DEPTH_OUT)))), check_x && check_y); const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT)))), - check_x1 && check_y); + check_x1 && check_y); const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))), - check_x && check_y1); + check_x && check_y1); const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1), (get_global_id(2) / DEPTH_OUT)))), - 
check_x1 && check_y1); + check_x1 && check_y1); int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3); #else /* BORDER_MODE_REPLICATE */ int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))), diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl new file mode 100644 index 0000000000..785206e3b9 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN) +/** Calculate the space to batch conversion. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. 
-DDATA_TYPE=float + * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32 + * @param[in] paddings_stride_x Stride of the paddinds tensor in X dimension (in bytes) + * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] paddings_stride_y Stride of the paddinds tensor in Y dimension (in bytes) + * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] paddingse_offset_first_element_in_bytes The offset of the first element in the second source image + * @param[in] block_shape_ptr Pointer to the block shape tensor. 
Supported data types: S32 + * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes) + * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor + * @param[in] batch_id The output tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_batch_nhwc( + TENSOR4D_DECLARATION(input), + IMAGE_DECLARATION(paddings), + VECTOR_DECLARATION(block_shape), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings); + Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + const int pad_left_x = *((__global int *)offset(&pad, 0, 0)); + const int pad_right_x = *((__global int *)offset(&pad, 1, 0)); + const int pad_left_y = *((__global int *)offset(&pad, 0, 1)); + const int pad_right_y = *((__global int *)offset(&pad, 1, 1)); + + int block_x = *((__global int *)vector_offset(&block, 0)); + int block_y = *((__global int 
*)vector_offset(&block, 1)); + + const int out_x = get_global_id(1); + const int out_y = get_global_id(2); + const int z = get_global_id(0); + + const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x); + const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); + + if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN))) + { + const int w = batch_id % BATCH_IN; + const int in_x = pos_x - pad_left_x; + const int in_y = pos_y - pad_left_y; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); + } +} +#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN) + +#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) +/** Calculate the space to batch conversion. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2 + * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2 + * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2 + * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2 + * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2 + * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2 + * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image + * @param[in] batch_id The output tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image + */ +__kernel void space_to_batch_static_nhwc( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + int block_x = BLOCK_SHAPE_X; + int block_y = BLOCK_SHAPE_Y; + + const int out_x = get_global_id(1); + const int out_y = get_global_id(2); + const int z = get_global_id(0); + + const int pos_x 
= out_x * block_x + ((batch_id / BATCH_IN) % block_x); + const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x); + + if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN) + { + const int w = batch_id % BATCH_IN; + const int in_x = pos_x - PAD_LEFT_X; + const int in_y = pos_y - PAD_LEFT_Y; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w)); + } +} +#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl new file mode 100644 index 0000000000..d44e78d990 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "helpers.h" + +#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) +/** Space to depth transformation. (NHWC) + * + * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2 + * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2 + * + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: All + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor + * @param[in] batch_id The input tensor batch id + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void space_to_depth_nhwc( + TENSOR4D_DECLARATION(input), + const int batch_id, + TENSOR3D_DECLARATION(output)) +{ + Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0); + Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output); + + const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE)); + const int x = get_global_id(1); + const int y = get_global_id(2); + const int z = get_global_id(0) % r; + + const int in_x = x * 
BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE; + const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE; + + *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id)); +} +#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl index d0cc0f24b7..74b9674a88 100644 --- a/src/core/CL/cl_kernels/upsample_layer.cl +++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2020 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -23,61 +23,6 @@ */ #include "helpers.h" -/** This function applies upsample on an input image. (NCHW) - * - * @attention The following variables must be passed at compile time: - * -# -DDATA_TYPE = Tensor data type. Supported data types: All - * -# -DVEC_SIZE_IN = Input vector size - * -# -DVEC_SIZE_OUT = Output vector size - * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit) - * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit) - * - * @param[in] src_ptr Pointer to the source image. Supported data types: All - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_ptr Pointer to the destination image. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void upsample_layer_nchw( - TENSOR3D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src); - Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst); - -#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) - // Check if access on width gets out of bounds - // If it does shift access vector to access elements within bounds - const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN); - const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT); - src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x; - dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x; - - VEC_DATA_TYPE(DATA_TYPE, 8) - data = vload8(0, (__global DATA_TYPE *)src.ptr); - - VEC_DATA_TYPE(DATA_TYPE, 16) - data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7); - - vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr); - vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)); -#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) - *((__global DATA_TYPE 
*)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr); - *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr); -#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) -} - /** This function applies upsample on an input image. (NHWC) * * @attention The following variables must be passed at compile time: @@ -132,4 +77,4 @@ __kernel void upsample_layer_nhwc( *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr); *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr); #endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT) -} +}
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl index 5c3bb8aa9b..8d5fd3437f 100644 --- a/src/core/CL/cl_kernels/winograd_filter_transform.cl +++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018-2019 Arm Limited. + * Copyright (c) 2018-2021 Arm Limited. * * SPDX-License-Identifier: MIT * @@ -37,288 +37,6 @@ out.s7 = tmp.s6; \ }) -/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_2x2_3x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); - - // Load the values from the input tensor -#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - 
VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = vload3(0, (__global DATA_TYPE *)(src_addr)); -#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y))); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); -#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 4) - out0 = 0.0f; - out0.s0 = (w0.s0); - out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f; - out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f; - out0.s3 = (w0.s2); - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 4) - out1 = 0.0f; - out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f; - out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f; - out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f; - out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f; - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 4) - out2 = 0.0f; - out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f; - out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f; - out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f; - out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 4) - out3 = 0.0f; - out3.s0 = (w2.s0); - out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f; - out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f; - out3.s3 = (w2.s2); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && 
!defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int z = get_global_id(2); - int x0 = z / SRC_DIM_Z; // idx filter - int y0 = z % SRC_DIM_Z; // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; - - // Store the values across the channels - // 16 channels for 3x3 kernels - // 4 channels for 3x1 or 1x3 kernels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - -/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using 
-DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x4_3x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); - - // Load the values from the input tensor -#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = vload3(0, (__global DATA_TYPE *)(src_addr)); -#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y))); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 3) - w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - VEC_DATA_TYPE(DATA_TYPE, 3) - w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); -#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 8) - out0 = 0.0f; - out0.s0 = (w0.s0) / 16.f; - out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f; - out0.s2 = (-w0.s0 + 
w0.s1 - w0.s2) / 24.f; - out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f; - out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f; - out0.s5 = (w0.s2) / 4.f; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 8) - out1 = 0.0f; - out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f; - out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; - out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f; - out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; - out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f; - out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f; - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 8) - out2 = 0.0f; - out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f; - out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; - out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f; - out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; - out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f; - out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 8) - out3 = 0.0f; - out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f; - out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 
+ 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f; - - // Row 4 - VEC_DATA_TYPE(DATA_TYPE, 8) - out4 = 0.0f; - out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f; - out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f; - out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f; - out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f; - - // Row 5 - VEC_DATA_TYPE(DATA_TYPE, 8) - out5 = 0.0f; - out5.s0 = (w2.s0) / 4.f; - out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f; - out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f; - out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f; - out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f; - out5.s5 = (w2.s2); -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int z = get_global_id(2); - int x0 = z / SRC_DIM_Z; // idx filter - int y0 = z % SRC_DIM_Z; // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y; - - // Store the values across the channels - // 36 channels for 3x3 kernels - // 6 channels for 3x1 or 1x3 kernels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; - 
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5; - *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3; - *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1; - *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3; - *(__global DATA_TYPE 
*)(dst_addr + 34 * dst_stride_z) = out5.s4; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - /** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4 * * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 @@ -477,308 +195,6 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc( #endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) } -/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * - * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time - * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x4_5x5_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z); - - const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0); - - // Load the values from the input tensor -#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - 
VEC_DATA_TYPE(DATA_TYPE, 4) - w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4); -#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)), - *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y))); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); -#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - VEC_DATA_TYPE(DATA_TYPE, 4) - w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y)); - DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y)); - DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y)); - DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y)); - DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4); - VEC_DATA_TYPE(DATA_TYPE, 4) - w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y)); - DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4); -#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) - - // Transform the input tile - - // Row 0 - VEC_DATA_TYPE(DATA_TYPE, 8) - out0 = 0.0f; - out0.s0 = w00.s0; - out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f; - out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f; - out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f; - out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 
8.f * w00.s3 + 16.f * w01) / 90.f; - out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f; - out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f; - out0.s7 = w01; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - // Row 1 - VEC_DATA_TYPE(DATA_TYPE, 8) - out1 = 0.0f; - out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f; - out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f; - out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f; - out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f; - out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f; - out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f; - out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f * - (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 
+ w21 + w31 + w41)) / 810.f; - out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f; - - // Row 2 - VEC_DATA_TYPE(DATA_TYPE, 8) - out2 = 0.0f; - out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f; - out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f; - out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f; - out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f; - out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f; - out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f; - out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f * - (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f; - out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f; - - // Row 3 - VEC_DATA_TYPE(DATA_TYPE, 8) - out3 = 0.0f; - out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f; - out3.s1 
= -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f; - out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f; - out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f; - out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f; - out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f; - out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 
16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + - (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f; - out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f; - - // Row 4 - VEC_DATA_TYPE(DATA_TYPE, 8) - out4 = 0.0f; - out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f; - out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f; - out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f; - out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f * - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; - out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * 
w40.s3) + 16.f * - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f; - out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; - out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f * - (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + - (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f; - out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f; - - // Row 5 - VEC_DATA_TYPE(DATA_TYPE, 8) - out5 = 0.0f; - out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f; - out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; - out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f; - out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + 
w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f; - out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; - out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + - (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f; - out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f; - - // Row 6 - VEC_DATA_TYPE(DATA_TYPE, 8) - out6 = 0.0f; - out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f; - out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * 
w21 - 2.f * w31 + w41)) / 810.f; - out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f; - out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f * - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f; - out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; - out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f * - (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + - (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f; - 
out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f; - - // Row 7 - VEC_DATA_TYPE(DATA_TYPE, 8) - out7 = 0.0f; - out7.s0 = w40.s0; - out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f; - out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f; - out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f; - out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f; - out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f; - out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f; - out7.s7 = w41; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - - int z = get_global_id(2); - int x0 = z / SRC_DIM_Z; // idx filter - int y0 = z % SRC_DIM_Z; // idx channel - - // Get output address - __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y; - - // Store the values across the channels - *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0; - *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1; - *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2; - *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3; - *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4; - *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5; - *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6; - *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7; - -#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) - *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0; - *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1; - *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2; - *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3; - *(__global DATA_TYPE *)(dst_addr + 12 * 
dst_stride_z) = out1.s4; - *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5; - *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6; - *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7; - *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0; - *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1; - *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2; - *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3; - *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4; - *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5; - *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6; - *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7; - *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0; - *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1; - *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2; - *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3; - *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4; - *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5; - *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6; - *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7; - *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0; - *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1; - *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2; - *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3; - *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4; - *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5; - *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6; - *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7; - *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0; - *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1; - *(__global DATA_TYPE 
*)(dst_addr + 42 * dst_stride_z) = out5.s2; - *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3; - *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4; - *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5; - *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6; - *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7; - *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0; - *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1; - *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2; - *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3; - *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4; - *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5; - *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6; - *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7; - *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0; - *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1; - *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2; - *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3; - *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4; - *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5; - *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6; - *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7; -#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -} - /** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4 * * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. 
-DSRC_DIM_Z=64 @@ -1360,152 +776,6 @@ __kernel void winograd_filter_transform_2x2_7x7_nhwc( #endif // defined(SRC_DIM_Z) #if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) -/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_2x1_3x1_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_2x2_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x1_3x1_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, 
- dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_4x1_5x1_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_5x5_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} /** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1 * @@ -1656,153 +926,6 @@ __kernel void winograd_filter_transform_2x1_7x1_nhwc( #endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) #if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) -/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. 
-DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x2_1x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_2x2_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x4_1x3_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_3x3_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, 
- dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - -/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4 - * - * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 - * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform - * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. - * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr - * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void winograd_filter_transform_1x4_1x5_nchw( - TENSOR4D_DECLARATION(src), - TENSOR3D_DECLARATION(dst)) -{ - winograd_filter_transform_4x4_5x5_nchw(src_ptr, - src_stride_x, - src_step_x, - src_stride_y, - src_step_y, - src_stride_z, - src_step_z, - src_stride_w, - src_step_w, - src_offset_first_element_in_bytes, - dst_ptr, - dst_stride_x, - dst_step_x, - dst_stride_y, - dst_step_y, - dst_stride_z, - dst_step_z, - dst_offset_first_element_in_bytes); -} - /** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4 * * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64 @@ -1949,4 +1072,4 @@ __kernel void winograd_filter_transform_1x2_1x7_nhwc( dst_step_z, dst_offset_first_element_in_bytes); } -#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) +#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl new file mode 100644 index 0000000000..4865982a55 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl @@ -0,0 +1,953 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "helpers.h" +#include "tile_helpers.h" + +#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \ + comm_fact.s2 = 2.5f * tmp.s3; \ + comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \ + comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \ + comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \ + \ + out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \ + out.s1 = comm_fact.s0 + comm_fact.s1; \ + out.s2 = comm_fact.s0 - comm_fact.s1; \ + out.s3 = comm_fact.s3 + comm_fact.s4; \ + out.s4 = comm_fact.s4 - comm_fact.s3; \ + out.s5 = comm_fact.s5 + comm_fact.s6; \ + out.s6 = comm_fact.s5 - comm_fact.s6; \ + out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \ + }) + +#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \ + ({ \ + comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \ + comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \ + comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \ + comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \ + comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \ + comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \ + out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \ + out.s1 = comm_fact.s0 - comm_fact.s1; \ + out.s2 = comm_fact.s0 + comm_fact.s1; \ + out.s3 = comm_fact.s2 - comm_fact.s3; \ + out.s4 = comm_fact.s2 + comm_fact.s3; \ + out.s5 = comm_fact.s4 - comm_fact.s5; \ + out.s6 = comm_fact.s4 + comm_fact.s5; \ + out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \ + }) + +#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) + +#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) +//! 
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Work-item mapping: x -> channel, y -> linearized tile index, z -> batch.
    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

    // All the tensor dimensions are passed at compile time.
    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _INUM_TILES_X NUM_TILES_X
#define _INUM_TILES_Y NUM_TILES_Y

    // Top-left spatial coordinate of this tile's 6x6 (or 6x1/1x6) input patch,
    // shifted by the convolution padding (may be negative; out-of-bounds reads
    // are zero-filled by the T_LOAD_NHWC boundary handling + the explicit init below).
    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
    x -= PAD_LEFT;
    y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 1D case (3x1 or 1x3 filter): transform a single row/column of 6 input points.
    TILE(DATA_TYPE, 6, 1, in);
    TILE(DATA_TYPE, 6, 1, out);

    // Initialize the input tile (zero padding for out-of-bounds elements)
    LOOP_UNROLLING(int, i, 0, 1, 6,
    {
        in[i].v = 0;
    })

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

    TILE(DATA_TYPE, 6, 1, com);

    // NOTE(review): the 1D path pre-scales the inputs by 4, so its outputs are a
    // constant multiple of the plain B^T*d transform — presumably compensated in the
    // matching weight/output transforms. Verify against the 1D filter transform kernels.
    LOOP_UNROLLING(int, i, 0, 1, 6,
    {
        in[i].v *= 4.0f;
    })

    // Common sub-expressions of the F(4,3) B^T matrix rows.
    com[0].v = in[2].v - 4.f * in[0].v;
    com[1].v = in[3].v - 4.f * in[1].v;
    com[2].v = in[4].v - 4.f * in[2].v;
    com[3].v = in[5].v - 4.f * in[3].v;
    com[4].v = in[3].v - in[1].v;
    com[4].v = com[4].v + com[4].v;
    com[5].v = in[4].v - in[2].v;

    out[0].v = com[2].v - com[0].v;
    out[1].v = com[2].v + com[1].v;
    out[2].v = com[2].v - com[1].v;
    out[3].v = com[5].v + com[4].v;
    out[4].v = com[5].v - com[4].v;
    out[5].v = com[3].v - com[1].v;

    TILE(uint, 6, 1, dst_indirect_y);

    // Destination row i holds transformed element i of every tile:
    // row = tile index + i * (tiles per image) + batch offset.
    LOOP_UNROLLING(int, i, 0, 1, 6,
    {
        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 2D case (3x3 filter): transform a 6x6 input patch, i.e. compute B^T * d * B
    // as two passes of the 1D transform (first along columns, then along rows).
    TILE(DATA_TYPE, 36, 1, in);

    // Initialize the input tile (zero padding for out-of-bounds elements)
    LOOP_UNROLLING(int, i, 0, 1, 36,
    {
        in[i].v = 0;
    })

    // Load the tile from a NHWC tensor
    T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

    TILE(DATA_TYPE, 6, 1, com);
    TILE(DATA_TYPE, 36, 1, tmp);

    // First pass: apply B^T along one axis; tmp is the partially transformed patch.
    LOOP_UNROLLING(int, i, 0, 1, 6,
    {
        com[0].v         = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
        com[1].v         = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
        com[2].v         = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
        com[3].v         = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
        com[4].v         = in[3 * 6 + i].v - in[1 * 6 + i].v;
        com[4].v         = com[4].v + com[4].v;
        com[5].v         = in[4 * 6 + i].v - in[2 * 6 + i].v;
        tmp[i + 0 * 6].v = com[2].v - com[0].v;
        tmp[i + 1 * 6].v = com[2].v + com[1].v;
        tmp[i + 2 * 6].v = com[2].v - com[1].v;
        tmp[i + 3 * 6].v = com[5].v + com[4].v;
        tmp[i + 4 * 6].v = com[5].v - com[4].v;
        tmp[i + 5 * 6].v = com[3].v - com[1].v;
    })

    TILE(DATA_TYPE, 36, 1, out);

    // Second pass: apply B^T along the other axis to complete B^T * d * B.
    LOOP_UNROLLING(int, i, 0, 1, 6,
    {
        com[0].v         = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v;
        com[1].v         = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v;
        com[2].v         = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v;
        com[3].v         = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v;
        com[4].v         = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
        com[4].v         = com[4].v + com[4].v;
        com[5].v         = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
        out[i * 6 + 0].v = com[2].v - com[0].v;
        out[i * 6 + 1].v = com[2].v + com[1].v;
        out[i * 6 + 2].v = com[2].v - com[1].v;
        out[i * 6 + 3].v = com[5].v + com[4].v;
        out[i * 6 + 4].v = com[5].v - com[4].v;
        out[i * 6 + 5].v = com[3].v - com[1].v;
    })

    // Compute destination address: one dst row per transformed element per tile.
    TILE(uint, 36, 1, dst_indirect_y);

    LOOP_UNROLLING(int, i, 0, 1, 36,
    {
        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                           Pointer to the source image.
 *                                              Supported data types: F32/F16
 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Work-item mapping: x -> channel, y -> linearized tile index, z -> batch.
    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

    // All the tensor dimensions are passed at compile time.
    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _INUM_TILES_X NUM_TILES_X
#define _INUM_TILES_Y NUM_TILES_Y

    // Top-left spatial coordinate of this tile's 8-point input patch, shifted by padding.
    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
    x -= PAD_LEFT;
    y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 1D case (5x1 or 1x5 filter): transform a single row/column of 8 input points.
    TILE(DATA_TYPE, 8, 1, in);
    TILE(DATA_TYPE, 8, 1, out);

    // Initialize the input tile (zero padding for out-of-bounds elements)
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        in[i].v = 0;
    })

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

    TILE(DATA_TYPE, 1, 8, com);

    // Apply the 8-point F(4,5) B^T row transform (same coefficients as OUTPUT_ROW_4x4_5x5).
    com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
    com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
    com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
    com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
    com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
    out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
    out[1].s[0] = com[0].s[0] + com[0].s[1];
    out[2].s[0] = com[0].s[0] - com[0].s[1];
    out[3].s[0] = com[0].s[3] + com[0].s[2];
    out[4].s[0] = com[0].s[3] - com[0].s[2];
    out[5].s[0] = com[0].s[4] + com[0].s[5];
    out[6].s[0] = com[0].s[4] - com[0].s[5];
    out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;

    TILE(uint, 8, 1, dst_indirect_y);

    // Destination row = tile index + element plane offset + batch offset.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 2D case (5x5 filter): transform an 8x8 input patch, i.e. compute B^T * d * B.
    TILE(DATA_TYPE, 64, 1, in);
    TILE(DATA_TYPE, 64, 1, out);

    // Initialize the input tile (zero padding for out-of-bounds elements)
    LOOP_UNROLLING(int, i, 0, 1, 64,
    {
        in[i].v = 0;
    })

    // Load the tile from a NHWC tensor
    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

    TILE(DATA_TYPE, 8, 8, com);

    // First pass: apply B^T along the columns.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
        com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
        com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
        com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
        com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
        com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
    })

    // Combine the common factors into the partially transformed patch.
    TILE(DATA_TYPE, 8, 8, tmp);
    tmp[0].v = com[6].v;
    tmp[1].v = com[0].v + com[1].v;
    tmp[2].v = com[0].v - com[1].v;
    tmp[3].v = com[2].v + com[3].v;
    tmp[4].v = com[2].v - com[3].v;
    tmp[5].v = com[4].v + com[5].v;
    tmp[6].v = com[4].v - com[5].v;
    tmp[7].v = com[7].v;

    // Second pass: apply B^T along the rows to complete B^T * d * B.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        com[0].s[0]         = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[1]         = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
        com[0].s[2]         = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
        com[0].s[3]         = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[5]         = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
        out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
        out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
        out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
        out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
        out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
        out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
        out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
        out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
    })

    TILE(uint, 64, 1, dst_indirect_y);

    // Destination row = tile index + element plane offset + batch offset.
    LOOP_UNROLLING(int, i, 0, 1, 64,
    {
        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

//!
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/2x1/1x2 when the data layout is NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 7x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x7, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Work-item mapping: x -> channel, y -> linearized tile index, z -> batch.
    const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
    const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
    const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX

    // All the tensor dimensions are passed at compile time.
    // In case of dynamic tensor support, the following dimensions should be passed as function argument.
#define _ISRC_WIDTH SRC_WIDTH
#define _ISRC_HEIGHT SRC_HEIGHT
#define _INUM_TILES_X NUM_TILES_X
#define _INUM_TILES_Y NUM_TILES_Y

    // Top-left spatial coordinate of this tile's 8-point input patch, shifted by padding.
    int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
    int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
    x -= PAD_LEFT;
    y -= PAD_TOP;

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 1D case (7x1 or 1x7 filter): transform a single row/column of 8 input points.
    TILE(DATA_TYPE, 8, 1, in);
    TILE(DATA_TYPE, 8, 1, out);

    // Initialize the input tile (zero padding for out-of-bounds elements)
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        in[i].v = 0;
    })

#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#else  // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
    T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)

    // NOTE(review): the 1D path pre-scales the inputs by -36, so its outputs are a
    // constant multiple of the plain B^T*d transform — presumably compensated in the
    // matching weight/output transforms. Verify against the 1D filter transform kernels.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        in[i].v *= (DATA_TYPE) - 36.0f;
    })

    TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };

    // Apply the 8-point F(2,7) B^T row transform (same coefficients as OUTPUT_ROW_2x2_7x7).
    com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
    com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
    com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
    com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
    com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
    com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
    out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
    out[1].s[0] = com[0].s[0] - com[0].s[1];
    out[2].s[0] = com[0].s[0] + com[0].s[1];
    out[3].s[0] = com[0].s[2] - com[0].s[3];
    out[4].s[0] = com[0].s[2] + com[0].s[3];
    out[5].s[0] = com[0].s[4] - com[0].s[5];
    out[6].s[0] = com[0].s[4] + com[0].s[5];
    out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;

    TILE(uint, 8, 1, dst_indirect_y);

    // Destination row = tile index + element plane offset + batch offset.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)

    // 2D case (7x7 filter): transform an 8x8 input patch, i.e. compute B^T * d * B.
    TILE(DATA_TYPE, 64, 1, in);
    TILE(DATA_TYPE, 64, 1, out);

    // Initialize the input tile (zero padding for out-of-bounds elements)
    LOOP_UNROLLING(int, i, 0, 1, 64,
    {
        in[i].v = 0;
    })

    // Load the tile from a NHWC tensor
    T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);

    TILE(DATA_TYPE, 8, 8, com);

    // First pass: apply B^T along the columns.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
        com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
        com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
        com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
        com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
        com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
    })

    // Combine the common factors into the partially transformed patch.
    TILE(DATA_TYPE, 8, 8, tmp);
    tmp[0].v = com[6].v;
    tmp[1].v = com[0].v - com[1].v;
    tmp[2].v = com[0].v + com[1].v;
    tmp[3].v = com[2].v - com[3].v;
    tmp[4].v = com[2].v + com[3].v;
    tmp[5].v = com[4].v - com[5].v;
    tmp[6].v = com[4].v + com[5].v;
    tmp[7].v = com[7].v;

    // Second pass: apply B^T along the rows to complete B^T * d * B.
    LOOP_UNROLLING(int, i, 0, 1, 8,
    {
        com[0].s[0]         = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[1]         = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
        com[0].s[2]         = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[3]         = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
        com[0].s[4]         = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
        com[0].s[5]         = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
        out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
        out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
        out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
        out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
        out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
        out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
        out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
        out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
    })

    TILE(uint, 64, 1, dst_indirect_y);

    // Destination row = tile index + element plane offset + batch offset.
    LOOP_UNROLLING(int, i, 0, 1, 64,
    {
        dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
        dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
    })

    T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);

#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}

//!
//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                           Pointer to the source image. Supported data types: F32/F16
 * @param[in] src_stride_x                      Stride of the source image in X dimension (in bytes)
 * @param[in] src_step_x                        src_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] src_stride_y                      Stride of the source image in Y dimension (in bytes)
 * @param[in] src_step_y                        src_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
 * @param[in] src_stride_z                      Stride of the source tensor in Z dimension (in bytes)
 * @param[in] src_step_z                        src_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] src_stride_w                      Stride of the source tensor in W dimension (in bytes)
 * @param[in] src_step_w                        src_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_ptr                           Pointer to the destination tensor. Supported data types: as @p src_ptr
 * @param[in] dst_stride_x                      Stride of the destination tensor in X dimension (in bytes)
 * @param[in] dst_step_x                        dst_stride_x * number of elements along X processed per workitem(in bytes)
 * @param[in] dst_stride_y                      Stride of the destination tensor in Y dimension (in bytes)
 * @param[in] dst_step_y                        dst_stride_y * number of elements along Y processed per workitem(in bytes)
 * @param[in] dst_stride_z                      Stride of the destination tensor in Z dimension (in bytes)
 * @param[in] dst_step_z                        dst_stride_z * number of elements along Z processed per workitem(in bytes)
 * @param[in] dst_stride_w                      Stride of the destination tensor in W dimension (in bytes)
 * @param[in] dst_step_w                        dst_stride_w * number of elements along W processed per workitem(in bytes)
 * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
 */
//! @endcond
__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
    TENSOR4D(src, BUFFER),
    TENSOR4D(dst, BUFFER))
{
    // Thin wrapper: the 4x4_3x3 kernel already handles the 1D (3x1) case when built
    // with -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL; forward all tensor arguments unchanged.
    winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
                                                 src_stride_x,
                                                 src_step_x,
                                                 src_stride_y,
                                                 src_step_y,
                                                 src_stride_z,
                                                 src_step_z,
                                                 src_stride_w,
                                                 src_step_w,
                                                 src_offset_first_element_in_bytes,
                                                 dst_ptr,
                                                 dst_stride_x,
                                                 dst_step_x,
                                                 dst_stride_y,
                                                 dst_step_y,
                                                 dst_stride_z,
                                                 dst_step_z,
                                                 dst_stride_w,
                                                 dst_step_w,
                                                 dst_offset_first_element_in_bytes);
}

//! @cond Doxygen_Suppress
/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
 *
 * @note Data layout supported: NHWC
 * @note Data type supported: F32/F16
 * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
 * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
 * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
 * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
 * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
 * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
 * @note If this kernel is used to perform Winograd input transform 5x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
 * @note If this kernel is used to perform Winograd input transform 1x5, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
 *
 * @param[in] src_ptr                           Pointer to the source image.
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} + +//! @cond Doxygen_Suppress +/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC + * + * @note Data layout supported: NHWC + * @note Data type supported: F32/F16 + * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half) + * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3). + * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2) + * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64) + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time + * + * @param[in] src_ptr Pointer to the source image. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +//! 
@endcond +__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER)) +{ + winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes); +} +#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y) +#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl new file mode 100644 index 0000000000..0fcd04e713 --- /dev/null +++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl @@ -0,0 +1,1030 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "activation_float_helpers.h" +#include "helpers.h" +#include "tile_helpers.h" + +#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x2_7x7_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ +#define _ISRC_HEIGHT SRC_HEIGHT +#define _IDST_WIDTH DST_WIDTH +#define _IDST_HEIGHT DST_HEIGHT +#define _INUM_TILES_X NUM_TILES_X + + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + + int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H; + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + TILE(DATA_TYPE, 8, N0, in); + TILE(DATA_TYPE, 2, N0, out); + TILE(uint, 8, 1, src_indirect_y); + + // Calculate the indirect Y for the source tensor + LOOP_UNROLLING(int, i, 0, 1, 8, + { + src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + + // Load the values across the 8 channels to compose the 8x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // Compute out0 and out01 + out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v; + out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v; + +#if defined(HAS_BIAS) + // Add bias + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out); +#endif // defined(HAS_BIAS) + + T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 2, 1, dst_indirect_y); + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 2, + { + int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 2, + { + int x_c = min(x_out + xk, 
((int)_IDST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 64, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(DATA_TYPE, 16, N0, tmp); + TILE(uint, 64, 1, src_indirect_y); + + // Calculate the indirect Y for the source tensor + LOOP_UNROLLING(int, i, 0, 1, 64, + { + src_indirect_y[i].v = mout + i *_ISRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // Load the values across the 64 channels to compose the 8x8 tile + T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v; + tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v; + }) + + // Compute the 2x2 output tile + LOOP_UNROLLING(int, i, 0, 1, 2, + { + out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v; + out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v; + }) + +#if defined(HAS_BIAS) + // Add bias + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // defined(HAS_BIAS) + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + 
TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 2, + { + LOOP_UNROLLING(int, xk, 0, 1, 2, + { + int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1)); + int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH; + dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. 
-DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. 
Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] dst_size Size of the destination tensor, minus the last padding + */ +__kernel void winograd_output_transform_4x4_3x3_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // defined(HAS_BIAS) + int dst_size) +{ + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + TILE(DATA_TYPE, 6, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(uint, 6, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 6, + { + in[i].v = 0; + }) + + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // Compute out00, out01, 
out02 and out03 + out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v; + out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v; + out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v; + out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v; + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH; + dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH; + dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Calculate the indirect Y for the source tensor + TILE(DATA_TYPE, 36, N0, in); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 36, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 36, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36); + }) + + // Initialize the input tile + 
LOOP_UNROLLING(int, i, 0, 1, 36, + { + in[i].v = 0; + }) + + // Load the values across the 36 channels to compose the 6x6 or 6x1 tile + T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + LOOP_UNROLLING(int, i, 0, 1, 6, + { + tmp[0].v = in[6 + i].v + in[12 + i].v; + tmp[1].v = in[6 + i].v - in[12 + i].v; + tmp[2].v = in[18 + i].v + in[24 + i].v; + tmp[3].v = in[18 + i].v - in[24 + i].v; + tmp[3].v = tmp[3].v + tmp[3].v; + in[i].v = in[i].v + tmp[0].v + tmp[2].v; + in[6 + i].v = tmp[3].v + tmp[1].v; + in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); + in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v; + }) + + // Compute the output tile + TILE(DATA_TYPE, 16, N0, out); + + LOOP_UNROLLING(int, i, 0, 1, 4, + { + tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v; + tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v; + tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v; + tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v; + tmp[3].v = tmp[3].v + tmp[3].v; + out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v; + out[4 * i + 1].v = tmp[3].v + tmp[1].v; + out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v); + out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v; + }) + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 16, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + int y_c = min(y_out + yk, 
((int)DST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; + dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1 + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x4_5x5_nhwc( + TENSOR4D(src, BUFFER), + TENSOR4D(dst, BUFFER), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM + const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES + const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + TILE(DATA_TYPE, 8, N0, in); + TILE(DATA_TYPE, 4, N0, out); + TILE(DATA_TYPE, 4, N0, tmp); + TILE(uint, 8, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 8, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 8, + { + in[i].v = 0; + }) + + // "in" contains 1x8 or 8x1 tile here + T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // A^T * in, and in this degenerate case out consists of 1 column/row + tmp[0].v = in[1].v - in[2].v; + tmp[1].v = 2.0f * (in[3].v - in[4].v); + tmp[2].v = 2.0f * (in[5].v + in[6].v); + tmp[3].v = in[3].v + in[4].v; + out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v; + out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v); + out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v; + out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v; + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 4, 1, dst_indirect_y); + + // Calculate the destination indirect Y +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH; + 
dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH; + dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); + +#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) + // Calculate the indirect Y for the source tensor + TILE(DATA_TYPE, 64, N0, in); + TILE(DATA_TYPE, 6, N0, tmp); + TILE(uint, 64, 1, src_indirect_y); + + LOOP_UNROLLING(int, i, 0, 1, 64, + { + src_indirect_y[i].v = mout + i *SRC_HEIGHT; + src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64); + }) + + // Initialize the input tile + LOOP_UNROLLING(int, i, 0, 1, 64, + { + in[i].v = 0; + }) + + // "in" here is 8x8 tile + T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in); + + // A^T * in + LOOP_UNROLLING(int, i, 0, 1, 8, + { + tmp[0].v = in[8 + i].v + in[16 + i].v; + tmp[1].v = in[8 + i].v - in[16 + i].v; + tmp[2].v = in[24 + i].v + in[32 + i].v; + tmp[3].v = in[24 + i].v - in[32 + i].v; + tmp[3].v = tmp[3].v + tmp[3].v; + tmp[4].v = in[40 + i].v + in[48 + i].v; + tmp[4].v = tmp[4].v + tmp[4].v; + tmp[5].v = in[40 + i].v - in[48 + i].v; + + // 4x8 matrix as a result + in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); + in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); + in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v); + in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v; + }) + + 
// Compute the output tile + TILE(DATA_TYPE, 16, N0, out); + + // in * A, with in = A^T * in as above + LOOP_UNROLLING(int, i, 0, 1, 4, + { + tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v; + tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v; + tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v; + tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v; + tmp[3].v = tmp[3].v + tmp[3].v; + tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v; + tmp[4].v = tmp[4].v + tmp[4].v; + tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v; + + // 4x4 tile + out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v); + out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v); + out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v; + out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v; + }) + +#if defined(HAS_BIAS) + TILE(DATA_TYPE, 1, N0, b); + + T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b); + + // c = c + bias[broadcasted] + T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out); +#endif // HAS_BIAS + + int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W; + int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H; + + T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out); + + TILE(uint, 16, 1, dst_indirect_y); + + // Calculate the destination indirect Y + LOOP_UNROLLING(int, yk, 0, 1, 4, + { + LOOP_UNROLLING(int, xk, 0, 1, 4, + { + int x_c = min(x_out + xk, ((int)DST_WIDTH - 1)); + int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1)); + dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH; + dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT); + }) + }) + + // Store the tile in reverse order so the invalid values are overwritten with the valid ones + T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y); +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || 
defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_2x1_7x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_3x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_4x1_5x1_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) + +#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#if defined(VEC_SIZE) && VEC_SIZE == 2 +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x2_1x7_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_2x2_7x7_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 2 + +#if defined(VEC_SIZE) && VEC_SIZE == 4 +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x3_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_3x3_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} + +/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC + * + * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16 + * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1 + * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4 + * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24 + * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32 + * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time + * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half. + * + * @param[in] src_ptr Pointer to the source tensor. 
Supported data types: F32/F16 + * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr + * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes) + * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes) + * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor + */ +__kernel void winograd_output_transform_1x4_1x5_nhwc( + TENSOR4D_DECLARATION(src), + TENSOR4D_DECLARATION(dst), +#if defined(HAS_BIAS) + VECTOR_DECLARATION(bias), +#endif // 
defined(HAS_BIAS) + int dst_size) +{ + winograd_output_transform_4x4_5x5_nhwc(src_ptr, + src_stride_x, + src_step_x, + src_stride_y, + src_step_y, + src_stride_z, + src_step_z, + src_stride_w, + src_step_w, + src_offset_first_element_in_bytes, + dst_ptr, + dst_stride_x, + dst_step_x, + dst_stride_y, + dst_step_y, + dst_stride_z, + dst_step_z, + dst_stride_w, + dst_step_w, + dst_offset_first_element_in_bytes, +#if defined(HAS_BIAS) + bias_ptr, + bias_stride_x, + bias_step_x, + bias_offset_first_element_in_bytes, +#endif // defined(HAS_BIAS) + dst_size); +} +#endif // defined(VEC_SIZE) && VEC_SIZE == 4 +#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL) +#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl deleted file mode 100644 index d63a2e51e8..0000000000 --- a/src/core/CL/cl_kernels/pooling_layer.cl +++ /dev/null @@ -1,981 +0,0 @@ -/* - * Copyright (c) 2017-2021 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. 
- */ -#include "helpers.h" -#include "repeat.h" -#include "tile_helpers.h" - -#if defined(POOL_AVG) || defined(POOL_L2) -#define POOL_OP(x, y) ((x) + (y)) -#else /* defined(POOL_AVG) || defined(POOL_L2) */ -#define POOL_OP(x, y) (fmax((x), (y))) -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) -#define POW2_OP(x, vec_size) ((x) * (x)) -#else /* defined(POOL_L2) */ -#define POW2_OP(x, vec_size) (x) -#endif /* defined(POOL_L2) */ - -#define DIV_OP(x, y) (x * (1.f / y)) -#define SQRT_OP(x) sqrt((x)) - -#if STRIDE_X == 1 -#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output) -#elif STRIDE_X == 2 /* STRIDE_X == 1 */ -#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output) -#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */ -#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output) -#endif /* STRIDE_X == 3 */ - -#if defined(FP_MIXED_PRECISION) -#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n)) -#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \ - CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n) -#else /* defined(FP_MIXED_PRECISION) */ -#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr) -#endif /* defined(FP_MIXED_PRECISION) */ - -#define POOLING3x3_STRIDE1(res, input, output) \ - ({ \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ - data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ - data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ 
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \ - data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \ - data00 = POW2_OP(data00, 4); \ - data01 = POW2_OP(data01, 2); \ - data10 = POW2_OP(data10, 4); \ - data11 = POW2_OP(data11, 2); \ - data20 = POW2_OP(data20, 4); \ - data21 = POW2_OP(data21, 2); \ - \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \ - \ - values00 = POOL_OP(values00, values10); \ - values01 = POOL_OP(values01, values11); \ - values00 = POOL_OP(values00, values20); \ - values01 = POOL_OP(values01, values21); \ - \ - res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ - res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ - }) - -#define POOLING3x3_STRIDE2(res, input, output) \ - ({ \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ - ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE 
*)tensor3D_offset(&input, 0, 1, 0)); \ - ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ - ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \ - data00 = POW2_OP(data00, 8); \ - data01 = POW2_OP(data01, 1); \ - data10 = POW2_OP(data10, 8); \ - data11 = POW2_OP(data11, 1); \ - data20 = POW2_OP(data20, 8); \ - data21 = POW2_OP(data21, 1); \ - \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \ - \ - values00 = POOL_OP(values00, values10); \ - values01 = POOL_OP(values01, values11); \ - values00 = POOL_OP(values00, values20); \ - values01 = POOL_OP(values01, values21); \ - \ - res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \ - res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \ - }) - -#define POOLING3x3_STRIDE3(res, input, output) \ - ({ \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 
0, 0) + 8); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \ - data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \ - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \ - data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \ - data00 = POW2_OP(data00, 8); \ - data01 = POW2_OP(data01, 4); \ - data10 = POW2_OP(data10, 8); \ - data11 = POW2_OP(data11, 4); \ - data20 = POW2_OP(data20, 8); \ - data21 = POW2_OP(data21, 4); \ - \ - data00 = POOL_OP(data00, data10); \ - data01 = POOL_OP(data01, data11); \ - data00 = POOL_OP(data00, data20); \ - data01 = POOL_OP(data01, data21); \ - \ - res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \ - res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \ - }) - -ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int start_x = get_global_id(0) * stride_x - pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - const int end_x = min(start_x + pool_size_x, upper_bound_w); - const int end_y = min(start_y + pool_size_y, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max(0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return ((end_y - start_y) * (end_x - start_x)); -} - -/** Performs a pooling function of pool size equal to 2. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. 
Supported data types are F16/F32; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_2( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - // Load data - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) - data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 = POW2_OP(data0, 2); - data1 = POW2_OP(data1, 2); -#endif /* defined(POOL_L2) */ - - // Perform calculations - data0 = POOL_OP(data0, data1); - ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average or l2 pooling - res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - *(__global DATA_TYPE *)output.ptr = 
(DATA_TYPE)res; -} - -/** Performs a pooling function of pool size equal to 3 - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_3( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - // Load data - VEC_DATA_TYPE(ACC_DATA_TYPE, 3) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 3) - data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); - VEC_DATA_TYPE(ACC_DATA_TYPE, 3) - data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 = POW2_OP(data0, 3); - data1 = POW2_OP(data1, 3); - data2 = POW2_OP(data2, 3); -#endif /* defined(POOL_L2) */ - - // Perform calculations - data0 = POOL_OP(data0, data1); - data0 = POOL_OP(data0, data2); - ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) 
|| defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; -} - -#if defined(POOLING3x3) - -#define CONVERT_OP(data_type) convert_##data_type##4 -#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type) - -VEC_DATA_TYPE(ACC_DATA_TYPE, 4) -calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h, - const int pad_x, const int pad_y, const int stride_x, const int stride_y) -{ - int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x; - int start_y = get_global_id(1) * stride_y - pad_y; - const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w); - const int end_y = min(start_y + pool_size, upper_bound_h); -#if defined(EXCLUDE_PADDING) - start_x = max((int4)0, start_x); - start_y = max(0, start_y); -#endif /* defined(EXCLUDE_PADDING) */ - return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x)); -} - -/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3 - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_optimized_3( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) - res; - - // Perform pooling 3x3 for 4 output elements - POOLING3x3(res, input, output); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - res *= 
calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr); -} -#endif // defined(POOLING3x3) - -#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) - -/** Performs a pooling function of pool size equal to N (NCHW) - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32; - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note In case of average pooling the following information must be passed at compile time: - * -DPOOL_AVG must be provided otherwise max pooling will be performed. - * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad) - * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F16/F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_MxN_nchw( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - - VEC_DATA_TYPE(ACC_DATA_TYPE, 8) - vdata = INITIAL_VALUE; - ACC_DATA_TYPE sdata = INITIAL_VALUE; - - // Load data - for(int y = 0; y < POOL_SIZE_Y; y++) - { - int x = 0; - for(; x <= ((int)POOL_SIZE_X - 8); x += 8) - { - VEC_DATA_TYPE(ACC_DATA_TYPE, 
8) - data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)); -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif /* defined(POOL_L2) */ - vdata = POOL_OP(vdata, data0); - } - - // Leftover - for(; x < (int)POOL_SIZE_X; ++x) - { - ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0))); -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif /* defined(POOL_L2) */ - sdata = POOL_OP(sdata, data0); - } - } - - // Reduce result - VEC_DATA_TYPE(ACC_DATA_TYPE, 4) - reduce4 = POOL_OP(vdata.s0123, vdata.s4567); - VEC_DATA_TYPE(ACC_DATA_TYPE, 2) - reduce2 = POOL_OP(reduce4.s01, reduce4.s23); - ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1); - res = POOL_OP(res, sdata); - -#if defined(POOL_AVG) || defined(POOL_L2) - // Divide by pool region in case of average pooling - res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)); -#endif /* defined(POOL_AVG) || defined(POOL_L2) */ - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res = SQRT_OP(res); -#endif /* defined(POOL_L2) */ - - // Store result - *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res; -} -#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) - -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - -inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom) -{ - const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT; - const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM; - - const int x = get_global_id(0) * STRIDE_X; - const int y = get_global_id(1) * STRIDE_Y; - const int z = get_global_id(2); - - //x axis: width, y axis: height, z axis: component - const uint padded_offset = input->offset_first_element_in_bytes - + x * input->stride_x - + y * 
input->stride_y - + z * input->stride_z; - - const uint offset_base = padded_offset - - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */ - - PAD_TENSOR_TOP * input->stride_y /* top padding */ - - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */ - - PAD_TENSOR_LEFT * sizeof(DATA_TYPE); - -#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) - *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT)); -#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ - *offset_top = (uint)(offset_base / sizeof(DATA_TYPE)); -#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */ - - *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz; - - return; -} - -#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F32 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. 
Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor - */ -__kernel void pooling_layer_2_nchw_indices_fp32( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - TENSOR3D_DECLARATION(indices)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); - - // Load data - float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0)); - float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0)); - - // Perform calculations - float data0_max = POOL_OP(data0.s0, data0.s1); - float data1_max = POOL_OP(data1.s0, data1.s1); - float res = POOL_OP(data0_max, data1_max); - // Store result - *(__global float *)output.ptr = res; - -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - - uint offset_top = 0; - uint offset_bottom = 0; - - offset_no_padding_nchw(&input, &offset_top, &offset_bottom); - - uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); - uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); - uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); - - *(__global uint 
*)indices.ptr = index; - -#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) -} - -/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW. - * - * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16 - * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13; - * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32 - * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor - */ -__kernel void pooling_layer_2_nchw_indices_fp16( - TENSOR3D_DECLARATION(input), - TENSOR3D_DECLARATION(output), - TENSOR3D_DECLARATION(indices)) -{ - // Get pixels pointer - Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); - Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); - Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices); - - // Load data - half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0)); - half2 data1 = VLOAD(2)(0, (__global 
half *)tensor3D_offset(&input, 0, 1, 0)); - - // Perform calculations - half data0_max = POOL_OP(data0.s0, data0.s1); - half data1_max = POOL_OP(data1.s0, data1.s1); - half res = POOL_OP(data0_max, data1_max); - // Store result - *(__global half *)output.ptr = res; - -#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) - - uint offset_top = 0; - uint offset_bottom = 0; - - offset_no_padding_nchw(&input, &offset_top, &offset_bottom); - - uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1)); - uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1)); - uint index = select(index1, index0, isgreaterequal(data0_max, data1_max)); - - *(__global uint *)indices.ptr = index; - -#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM) -} - -#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) - -#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) -/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types: - * -# max, -DPOOL_MAX must be passed at compile time - * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time - * -# l2 normalisation, -DPOOL_L2 must be passed at compile time - * - * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 - * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float - * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result - * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. 
-DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4 - * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT - * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y - * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. 
Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - */ -__kernel void pooling_layer_MxN_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output)) -{ - // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 - // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side - int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER); - int idx_out_w = GET_SPATIAL_IDX(1, 1, 0); -#if DST_BATCH_SIZE != 1 - // If batch size != 1, the batch size dimension is collapsed over the height dimension - int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT; - int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT; -#else //DST_BATCH_SIZE != 1 - int idx_out_h = GET_SPATIAL_IDX(2, 1, 0); - int idx_out_n = 0; -#endif // DST_BATCH_SIZE != 1 - - __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; - - __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * - output_stride_w; - - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - res0 = INITIAL_VALUE; - - int idx_in_w = idx_out_w * STRIDE_X - PAD_X; - int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; - - int pool_x_s = max((int)0, -idx_in_w); - int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w); - int pool_y_s = max((int)0, -idx_in_h); - int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h); - -#if defined(EXCLUDE_PADDING) - int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s); -#else // defined(EXCLUDE_PADDING) - int filter_size = POOL_SIZE_X * POOL_SIZE_Y; -#endif // defined(EXCLUDE_PADDING) - -#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 - // Global pooling path - for(int y = 0; y < POOL_SIZE_Y; ++y) - { -#pragma unroll 8 - for(int x = 0; x < POOL_SIZE_X; ++x) - { -#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 - for(int y = pool_y_s; y < pool_y_e; ++y) - { -#pragma unroll 8 - for(int x = pool_x_s; x < pool_x_e; ++x) - { -#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0 - 
VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - data0; -#if defined(FP_MIXED_PRECISION) - // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE - data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); -#else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)); -#endif // defined(FP_MIXED_PRECISION) - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; -#endif // defined(POOL_L2) - res0 = POOL_OP(res0, data0); - } - } - -#if defined(POOL_AVG) || defined(POOL_L2) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; -#endif // defined(POOL_AVG) || defined(POOL_L2) - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res0 = SQRT_OP(res0); -#endif // defined(POOL_L2) - - // Store result -#if defined(FP_MIXED_PRECISION) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); - STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#else // defined(FP_MIXED_PRECISION) - STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#endif // defined(FP_MIXED_PRECISION) -} -#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y) - -#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - -/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types: - * -# max, -DPOOL_MAX must be passed at compile time - * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time - * -# average, -DPOOL_AVG must be passed at compile time. 
If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time - * -# l2 normalisation, -DPOOL_L2 must be passed at compile time - * - * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16 - * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float - * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result - * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT - * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE - * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions - * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y - * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 - * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE - * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0 - * - * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: F32/F16 - * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) - * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) - * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) - * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes) - * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor - * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr - * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) - * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) - * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes) - * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes) - * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor - * @param[in] indices_ptr (Optional) Pointer to the indices tensor. 
Supported data types: U32 - * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes) - * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes) - * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes) - * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes) - * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes) - * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes) - * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor - */ -__kernel void pooling_layer_2x2_nhwc( - TENSOR4D_DECLARATION(input), - TENSOR4D_DECLARATION(output) -#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) - , - TENSOR4D_DECLARATION(indices) -#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) -) -{ - // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0 - // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. 
This operation is performed on the host side - int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0); - int idx_out_w = get_global_id(1); -#if DST_BATCH_SIZE != 1 - // If batch size != 1, the batch size dimension is collapsed over the height dimension - int idx_out_h = get_global_id(2) % DST_HEIGHT; - int idx_out_n = get_global_id(2) / DST_HEIGHT; -#else //SRC_BATCH_SIZE != 1 - int idx_out_h = get_global_id(2); - int idx_out_n = 0; -#endif // SRC_BATCH_SIZE != 1 - - int idx_in_w = idx_out_w * STRIDE_X - PAD_X; - int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y; - - __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w; - - __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n * - output_stride_w; - - int pool_x_s = max((int)0, -idx_in_w); - int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w); - int pool_y_s = max((int)0, -idx_in_h); - int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h); - - int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s); - - int x0 = pool_x_s + idx_in_w; - int y0 = pool_y_s + idx_in_h; - int x1 = pool_x_e - 1 + idx_in_w; - int y1 = pool_y_e - 1 + idx_in_h; - - REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0); - -#if defined(FP_MIXED_PRECISION) - // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE - data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * 
input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); - data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)); -#else // defined(FP_MIXED_PRECISION) - data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)); - data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)); - data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)); - data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)); -#endif // defined(FP_MIXED_PRECISION) - -#if !defined(POOL_MAX) - if(filter_size != 4) - { - SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0; - SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1); - SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0; - SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1); - - // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound) - data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s)); - data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s)); - data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e)); - data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e)); - } -#endif // !defined(POOL_MAX) - -#if defined(POOL_L2) - // Raise to power of 2 for L2 Pooling - data0 *= data0; - data1 *= data1; - data2 *= data2; - data3 *= data3; -#endif /* defined(POOL_L2) */ - - VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE) - res0 = data0; - res0 = POOL_OP(res0, data1); - res0 = POOL_OP(res0, data2); - res0 = POOL_OP(res0, data3); - -#if 
defined(POOL_AVG) || defined(POOL_L2) -#if defined(EXCLUDE_PADDING) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size; -#else // !defined(EXCLUDE_PADDING) - res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4; -#endif // defined(EXCLUDE_PADDING) -#endif // defined(POOL_AVG) || defined(POOL_L2) - -#if defined(POOL_L2) - // Take square root of the result in L2 pooling - res0 = SQRT_OP(res0); -#endif // defined(POOL_L2) - - // Store result -#if defined(FP_MIXED_PRECISION) - VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) - res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)); - STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#else // defined(FP_MIXED_PRECISION) - STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0); -#endif // defined(FP_MIXED_PRECISION) - -#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) - - // This part is used to return the index of the maximum value - // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor - - // note: Batch dimension does not contribute in the offset contribution - VEC_DATA_TYPE(uint, VEC_SIZE) - base_index = (uint)idx_out_c; - - base_index += VEC_OFFS(uint, VEC_SIZE); - - VEC_DATA_TYPE(uint, VEC_SIZE) - index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) - index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) - index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); - VEC_DATA_TYPE(uint, VEC_SIZE) - index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH); - - index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE))); - index1 = select(index3, index2, CONVERT(isgreaterequal(data2, 
data3), VEC_DATA_TYPE(int, VEC_SIZE))); - index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE))); - - __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n * - indices_stride_w; - - // Store result - STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0)); -#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX) -} -#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
\ No newline at end of file diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl deleted file mode 100644 index 7983734fc4..0000000000 --- a/src/core/CL/cl_kernels/sobel_filter.cl +++ /dev/null @@ -1,541 +0,0 @@ -/* - * Copyright (c) 2016, 2017 Arm Limited. - * - * SPDX-License-Identifier: MIT - * - * Permission is hereby granted, free of charge, to any person obtaining a copy - * of this software and associated documentation files (the "Software"), to - * deal in the Software without restriction, including without limitation the - * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or - * sell copies of the Software, and to permit persons to whom the Software is - * furnished to do so, subject to the following conditions: - * - * The above copyright notice and this permission notice shall be included in all - * copies or substantial portions of the Software. - * - * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR - * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, - * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE - * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER - * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, - * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE - * SOFTWARE. - */ -#include "helpers.h" - -/***********************************************/ -/* Begin implementation of Sobel3x3 filter */ -/***********************************************/ - -/** This OpenCL kernel that computes a Sobel3x3 filter. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. 
Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void sobel3x3( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels -#ifdef GRAD_X - short8 gx = (short8)0; -#endif /* GRAD_X */ -#ifdef GRAD_Y - short8 gy = (short8)0; -#endif /* GRAD_Y */ - - // Row0 - uchar16 temp = vload16(0, offset(&src, -1, -1)); - short8 left = convert_short8(temp.s01234567); - short8 middle = convert_short8(temp.s12345678); - short8 right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-1); - gx += right * (short8)(+1); -#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(-1); - gy += middle * (short8)(-2); - gy += right * (short8)(-1); -#endif /* GRAD_Y */ - - // Row1 - temp = vload16(0, offset(&src, -1, 0)); - left = convert_short8(temp.s01234567); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-2); - gx += right * (short8)(+2); -#endif /* GRAD_X */ - - // Row2 - temp = vload16(0, offset(&src, -1, 1)); - left = convert_short8(temp.s01234567); - middle = convert_short8(temp.s12345678); - right = convert_short8(temp.s23456789); -#ifdef GRAD_X - gx += left * (short8)(-1); - gx += right * (short8)(+1); 
-#endif /* GRAD_X */ -#ifdef GRAD_Y - gy += left * (short8)(+1); - gy += middle * (short8)(+2); - gy += right * (short8)(+1); -#endif /* GRAD_Y */ - - // Store results -#ifdef GRAD_X - vstore8(gx, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gy, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/**********************************************/ -/* End implementation of Sobel3x3 filter */ -/**********************************************/ - -/***********************************************/ -/* Begin implementation of Sobel5x5 filter */ -/***********************************************/ - -/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] left1_coeff_gx Weight of the most left pixel for gx - * @param[in] left2_coeff_gx Weight of the left pixel for gx - * @param[in] middle_coeff_gx Weight of the middle pixel for gx - * @param[in] right1_coeff_gx Weight of the right pixel for gx - * @param[in] right2_coeff_gx Weight of the most right pixel for gx - * @param[in] left1_coeff_gy Weight of the most left pixel for gy - * @param[in] left2_coeff_gy Weight of the left pixel for gy - * @param[in] middle_coeff_gy Weight of the middle pixel for gy - * @param[in] right1_coeff_gy Weight of the right pixel for gy - * @param[in] right2_coeff_gy Weight of the most right pixel for gy - * - * @return a short16 containing short8 gx and short8 gy values. 
- */ -short16 sobel1x5( - Image *src, - const short left1_coeff_gx, - const short left2_coeff_gx, - const short middle_coeff_gx, - const short right1_coeff_gx, - const short right2_coeff_gx, - const short left1_coeff_gy, - const short left2_coeff_gy, - const short middle_coeff_gy, - const short right1_coeff_gy, - const short right2_coeff_gy) -{ - uchar16 temp = vload16(0, offset(src, -2, 0)); - short8 gx = 0; - short8 gy = 0; - short8 val; - - val = convert_short8(temp.s01234567); - gx += val * (short8)left1_coeff_gx; - gy += val * (short8)left1_coeff_gy; - - val = convert_short8(temp.s12345678); - gx += val * (short8)left2_coeff_gx; - gy += val * (short8)left2_coeff_gy; - - val = convert_short8(temp.s23456789); - gx += val * (short8)middle_coeff_gx; - gy += val * (short8)middle_coeff_gy; - - val = convert_short8(temp.s3456789a); - gx += val * (short8)right1_coeff_gx; - gy += val * (short8)right1_coeff_gy; - - val = convert_short8(temp.s456789ab); - gx += val * (short8)right2_coeff_gx; - gy += val * (short8)right2_coeff_gy; - - return (short16)(gx, gy); -} - -/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels). - * - * @param[in] src Pointer to source image. - * @param[in] up1_coeff Weight of the most up pixel - * @param[in] up2_coeff Weight of the up pixel - * @param[in] middle_coeff Weight of the middle pixel - * @param[in] down1_coeff Weight of the down pixel - * @param[in] down2_coeff Weight of the most down pixel - * - * @return a short8 containing 8 convoluted values. 
- */ -short8 sobel5x1( - Image *src, - const short up1_coeff, - const short up2_coeff, - const short middle_coeff, - const short down1_coeff, - const short down2_coeff) -{ - short8 val; - short8 out = (short8)0; - - val = vload8(0, (__global short *)offset(src, 0, -2)); - out += val * (short8)up1_coeff; - - val = vload8(0, (__global short *)offset(src, 0, -1)); - out += val * (short8)up2_coeff; - - val = vload8(0, (__global short *)offset(src, 0, 0)); - out += val * (short8)middle_coeff; - - val = vload8(0, (__global short *)offset(src, 0, 1)); - out += val * (short8)down1_coeff; - - val = vload8(0, (__global short *)offset(src, 0, 2)); - out += val * (short8)down2_coeff; - - return (short8)(out); -} - -/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image.. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image.. 
Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void sobel_separable1x5( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels - short16 gx_gy = sobel1x5(&src, - -1, -2, 0, 2, 1, - 1, 4, 6, 4, 1); - - // Store result in dst -#ifdef GRAD_X - vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images - * and output two single channel S16 images. 
- * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_x_ptr Pointer to the source image.. Supported data types: S16 - * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] src_y_ptr Pointer to the source image. 
Supported data types: S16 - * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] dummy Dummy parameter to easy conditional inclusion - */ -__kernel void sobel_separable5x1( -#ifdef GRAD_X - IMAGE_DECLARATION(src_x), - IMAGE_DECLARATION(dst_gx), -#endif /* GRAD_X */ -#ifdef GRAD_Y - IMAGE_DECLARATION(src_y), - IMAGE_DECLARATION(dst_gy), -#endif /* GRAD_Y */ - int dummy) -{ -#ifdef GRAD_X - Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x); - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y); - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - -#ifdef GRAD_X - short8 gx = sobel5x1(&src_x, - 1, 4, 6, 4, 1); - vstore8(gx, 0, ((__global short *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - short8 gy = sobel5x1(&src_y, - -1, -2, 0, 2, 1); - vstore8(gy, 0, ((__global short *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/**********************************************/ 
-/* End implementation of Sobel5x5 filter */ -/**********************************************/ - -/***********************************************/ -/* Begin implementation of Sobel7x7 filter */ -/***********************************************/ - -/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */ -#define X0 -1 -#define X1 -4 -#define X2 -5 -#define X3 0 -#define X4 5 -#define X5 4 -#define X6 1 - -/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients */ -#define Y0 1 -#define Y1 6 -#define Y2 15 -#define Y3 20 -#define Y4 15 -#define Y5 6 -#define Y6 1 - -/* Calculates single horizontal iteration. */ -#define SOBEL1x1_HOR(src, gx, gy, idx) \ - { \ - int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \ - gx += val * X##idx; \ - gy += val * Y##idx; \ - } - -/* Calculates single vertical iteration. */ -#define SOBEL1x1_VERT(src, g, direction, idx) \ - { \ - int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \ - g += val * (int8)direction##idx; \ - } - -/* Calculates a 1x7 horizontal iteration. */ -#define SOBEL1x7(ptr, gx, gy) \ - SOBEL1x1_HOR(ptr, gx, gy, 0) \ - SOBEL1x1_HOR(ptr, gx, gy, 1) \ - SOBEL1x1_HOR(ptr, gx, gy, 2) \ - SOBEL1x1_HOR(ptr, gx, gy, 3) \ - SOBEL1x1_HOR(ptr, gx, gy, 4) \ - SOBEL1x1_HOR(ptr, gx, gy, 5) \ - SOBEL1x1_HOR(ptr, gx, gy, 6) - -/* Calculates a 7x1 vertical iteration. */ -#define SOBEL7x1(ptr, g, direction) \ - SOBEL1x1_VERT(ptr, g, direction, 0) \ - SOBEL1x1_VERT(ptr, g, direction, 1) \ - SOBEL1x1_VERT(ptr, g, direction, 2) \ - SOBEL1x1_VERT(ptr, g, direction, 3) \ - SOBEL1x1_VERT(ptr, g, direction, 4) \ - SOBEL1x1_VERT(ptr, g, direction, 5) \ - SOBEL1x1_VERT(ptr, g, direction, 6) - -/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S16 images and leave the borders undefined. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. 
You can use both when computation of both gradients is required. - * - * @param[in] src_ptr Pointer to the source image. Supported data types: U8 - * @param[in] src_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S32 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - */ -__kernel void sobel_separable1x7( - IMAGE_DECLARATION(src) -#ifdef GRAD_X - , - IMAGE_DECLARATION(dst_gx) -#endif /* GRAD_X */ -#ifdef GRAD_Y - , - IMAGE_DECLARATION(dst_gy) -#endif /* GRAD_Y */ -) -{ - Image src = CONVERT_TO_IMAGE_STRUCT(src); -#ifdef GRAD_X - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - int8 gx = (int8)0; - int8 gy = (int8)0; - - SOBEL1x7(&src, gx, gy); - - // Store result in dst -#ifdef GRAD_X - vstore8(gx, 0, ((__global int *)dst_gx.ptr)); -#endif /* GRAD_X */ -#ifdef GRAD_Y - vstore8(gy, 0, ((__global int *)dst_gy.ptr)); -#endif /* GRAD_Y */ -} - -/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined. - * - * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient - * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required. - * - * @param[in] src_x_ptr Pointer to the source image. 
Supported data types: S32 - * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16 - * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32 - * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes) - * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes) - * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] dst_gy_ptr Pointer to the destination image. 
Supported data types: S16 - * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes) - * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes) - * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes) - * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes) - * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] dummy Dummy parameter to easy conditional inclusion - */ -__kernel void sobel_separable7x1( -#ifdef GRAD_X - IMAGE_DECLARATION(src_x), - IMAGE_DECLARATION(dst_gx), -#endif /* GRAD_X */ -#ifdef GRAD_Y - IMAGE_DECLARATION(src_y), - IMAGE_DECLARATION(dst_gy), -#endif /* GRAD_Y */ - int dummy) -{ -#ifdef GRAD_X - Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x); - Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx); -#endif /* GRAD_X */ -#ifdef GRAD_Y - Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y); - Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy); -#endif /* GRAD_Y */ - - // Output pixels -#ifdef GRAD_X - int8 gx = 0; - SOBEL7x1(&src_x, gx, Y); - vstore8(gx, 0, (__global int *)dst_gx.ptr); -#endif /* GRAD_X */ -#ifdef GRAD_Y - int8 gy = 0; - SOBEL7x1(&src_y, gy, X); - vstore8(gy, 0, (__global int *)dst_gy.ptr); -#endif /* GRAD_Y */ -} - -/**********************************************/ -/* End implementation of Sobel7x7 filter */ -/**********************************************/ diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp index 73da93c1f5..1ba307cb1c 100644 --- a/src/core/gpu/cl/ClKernelLibrary.cpp +++ b/src/core/gpu/cl/ClKernelLibrary.cpp @@ -177,484 +177,508 @@ namespace opencl { const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = { - { "activation_layer", "activation_layer.cl" }, - { "activation_layer_quant", "activation_layer_quant.cl" }, - { "activation_layer_quant_f32", 
"activation_layer_quant.cl" }, - { "arg_min_max_x", "arg_min_max.cl" }, - { "arg_min_max_y", "arg_min_max.cl" }, - { "arg_min_max_z", "arg_min_max.cl" }, - { "arg_min_max_w", "arg_min_max.cl" }, - { "batch_to_space_nchw", "batch_to_space.cl" }, - { "batch_to_space_static_nchw", "batch_to_space.cl" }, - { "batch_to_space_nhwc", "batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" }, - { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" }, - { "bitwise_or", "bitwise_op.cl" }, - { "bitwise_and", "bitwise_op.cl" }, - { "bitwise_xor", "bitwise_op.cl" }, - { "bitwise_not", "bitwise_op.cl" }, - { "bounding_box_transform", "bounding_box_transform.cl" }, - { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" }, - { "channel_shuffle_nchw", "channel_shuffle.cl" }, - { "channel_shuffle_nhwc", "channel_shuffle.cl" }, - { "compare_equal", "comparisons.cl" }, - { "compare_equal_quantized", "comparisons.cl" }, - { "compare_notequal", "comparisons.cl" }, - { "compare_notequal_quantized", "comparisons.cl" }, - { "compare_greater", "comparisons.cl" }, - { "compare_greater_quantized", "comparisons.cl" }, - { "compare_greaterequal", "comparisons.cl" }, - { "compare_greaterequal_quantized", "comparisons.cl" }, - { "compare_less", "comparisons.cl" }, - { "compare_less_quantized", "comparisons.cl" }, - { "compare_lessequal", "comparisons.cl" }, - { "compare_lessequal_quantized", "comparisons.cl" }, - { "concatenate", "concatenate.cl" }, - { "concatenate_width", "concatenate.cl" }, - { "concatenate_height", "concatenate.cl" }, - { "concatenate_width_x2", "concatenate.cl" }, - { "concatenate_width_x4", "concatenate.cl" }, - { "col2im", "col2im.cl" }, - { "cast_down", "cast.cl" }, - { "cast_up", "cast.cl" }, - { "convert_fc_weights", "convert_fc_weights.cl" }, - { "copy_tensor", "copy_tensor.cl" }, - { "crop_tensor", "crop_tensor.cl" }, - { 
"deconvolution_reshape", "deconvolution_layer.cl" }, - { "deconvolution_upsample", "deconvolution_layer.cl" }, - { "depth_to_space_nchw", "depth_to_space.cl" }, - { "depth_to_space_nhwc", "depth_to_space.cl" }, - { "dequantization_layer", "dequantization_layer.cl" }, - { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" }, - { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "dwc_native_fp_nhwc.cl" }, - { "dwc_native_quantized_nhwc", "dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "direct_convolution.cl" }, - { "direct_convolution1x1", "direct_convolution1x1.cl" }, - { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" }, - { "direct_convolution3x3", "direct_convolution3x3.cl" }, - { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" }, - { "direct_convolution5x5", "direct_convolution5x5.cl" }, - { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" }, - { "direct_convolution_quantized", "direct_convolution_quantized.cl" }, - { "elementwise_operation_ADD", "elementwise_operation.cl" }, - { "elementwise_operation_SUB", "elementwise_operation.cl" }, - { "elementwise_operation_MAX", "elementwise_operation.cl" }, - { "elementwise_operation_MIN", "elementwise_operation.cl" }, - { "elementwise_operation_DIV", "elementwise_operation.cl" }, - { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" }, - { "elementwise_operation_POWER", "elementwise_operation.cl" }, - { "elementwise_operation_PRELU", "elementwise_operation.cl" }, - { "elementwise_operation_AND", "elementwise_operation.cl" }, - { "elementwise_operation_OR", "elementwise_operation.cl" }, - { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", 
"elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" }, - { "elementwise_unary", "elementwise_unary.cl" }, - { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "fft.cl" }, - { "fft_radix_2_axis_0", "fft.cl" }, - { "fft_radix_2_axis_1", "fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "fft.cl" }, - { "fft_radix_3_axis_0", "fft.cl" }, - { "fft_radix_3_axis_1", "fft.cl" }, - { "fft_radix_4_first_stage_axis_0", "fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "fft.cl" }, - { "fft_radix_4_axis_0", "fft.cl" }, - { "fft_radix_4_axis_1", "fft.cl" }, - { "fft_radix_5_first_stage_axis_0", "fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "fft.cl" }, - { "fft_radix_5_axis_0", "fft.cl" }, - { "fft_radix_5_axis_1", "fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "fft.cl" }, - { "fft_radix_7_axis_0", "fft.cl" }, - { "fft_radix_7_axis_1", "fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "fft.cl" }, - { "fft_radix_8_axis_0", "fft.cl" }, - { "fft_radix_8_axis_1", "fft.cl" }, - { "fft_scale_conj", "fft_scale.cl" }, - { "fill_image_borders_constant", "fill_border.cl" }, - { "fill_image_borders_replicate", "fill_border.cl" }, - { "floor_layer", "floor.cl" }, - { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" }, - { "gather", "gather.cl" }, - { "gemm_ma_f16", "gemm.cl" }, - { "gemm_ma_f32", "gemm.cl" }, - { "gemm_mv", "gemv.cl" }, - { "gemm_mv_quantized", "gemv.cl" }, - { "gemm_mm_interleaved_transposed_f16", "gemm_v1.cl" }, - { 
"gemm_mm_interleaved_transposed_f16_acc32", "gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f32", "gemm_v1.cl" }, - { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm_v1.cl" }, - { "gemm_mm_floating_point", "gemm_v1.cl" }, - { "gemm_mm_floating_point_f16_bifrost", "gemm_v1.cl" }, - { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm_v1.cl" }, - { "gemm_mm_floating_point_f32_bifrost", "gemm_v1.cl" }, - { "gemm_mm_floating_point_f32_bifrost_1000", "gemm_v1.cl" }, - { "gemm_mm_native", "gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "gemm.cl" }, - { "gemm_lc_vm_f32", "gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "gemm.cl" }, - { "gemm_reshape_lhs_matrix_t", "gemm.cl" }, - { "gemm_reshape_rhs_matrix_nt", "gemm.cl" }, - { "gemm_reshape_rhs_matrix_t", "gemm.cl" }, - { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" }, - { "gemmlowp_mm_native", "gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "gemmlowp.cl" }, - { "gemmlowp_offset_contribution", "gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" }, - { 
"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" }, - { "im2col1x1_stridex1_nchw", "im2col.cl" }, - { "im2col3x3_nchw", "im2col.cl" }, - { "im2col5x5_nchw", "im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "im2col.cl" }, - { "im2col_generic_nchw", "im2col.cl" }, - { "im2col_generic_padx0_pady0_nchw", "im2col.cl" }, - { "im2col3x3_nhwc", "im2col.cl" }, - { "im2col9x9_nhwc", "im2col.cl" }, - { "im2col_generic_nhwc", "im2col.cl" }, - { "instance_normalization", "instance_normalization.cl" }, - { "compute_mean_var", "instance_normalization.cl" }, - { "l2_normalize_x", "l2_normalize.cl" }, - { "l2_normalize_y", "l2_normalize.cl" }, - { "l2_normalize_z", "l2_normalize.cl" }, - { "max_unpooling_layer_2", "unpooling_layer.cl" }, - { "mean_stddev_normalization", "mean_stddev_normalization.cl" }, - { "memset", "memset.cl" }, - { "minmax_layer", "minmax_layer.cl" }, - { "non_max_suppression", "nonmax.cl" }, - { "normalization_layer_cross_map_nchw", "normalization_layer.cl" }, - { "normalization_layer_cross_map_nhwc", "normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" }, - { "pad_layer_constant", "pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "pad_layer.cl" }, - { "permute", "permute.cl" }, - { "pixelwise_mul_complex", "pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "pixelwise_mul_float.cl" 
}, - { "pixelwise_mul_int", "pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" }, - { "pooling_layer_2", "pooling_layer.cl" }, - { "pooling_layer_3", "pooling_layer.cl" }, - { "pooling_layer_optimized_3", "pooling_layer.cl" }, - { "pooling_layer_7", "pooling_layer.cl" }, - { "pooling_layer_MxN_nchw", "pooling_layer.cl" }, - { "pooling_layer_MxN_nhwc", "pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" }, - { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" }, - { "prior_box_layer_nchw", "prior_box_layer.cl" }, - { "qlstm_layer_normalization", "qlstm_layer_normalization.cl" }, - { "quantization_layer", "quantization_layer.cl" }, - { "range", "range.cl" }, - { "range_quantized", "range.cl" }, - { "reduction_operation_x", "reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", "reduction_operation.cl" }, - { "reduction_operation_y", "reduction_operation.cl" }, - { "reduction_operation_z", "reduction_operation.cl" }, - { "reduction_operation_w", "reduction_operation.cl" }, - { "remap_nearest_neighbour_nchw", "remap.cl" }, - { "remap_bilinear_nchw", "remap.cl" }, - { "remap_nearest_neighbour_nhwc", "remap.cl" }, - { "remap_bilinear_nhwc", "remap.cl" }, - { "reorg_layer_nchw", "reorg_layer.cl" }, - { "reorg_layer_nhwc", "reorg_layer.cl" }, - { "reshape_layer", "reshape_layer.cl" }, - { "reshape_to_columns", "convolution_layer.cl" }, - { "reverse", "reverse.cl" }, - { "roi_align_layer", "roi_align_layer.cl" }, - { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "roi_pooling_layer.cl" }, - { "scale_nearest_neighbour_nchw", "scale.cl" }, - { "scale_nearest_neighbour_nhwc", "scale.cl" }, - { "scale_bilinear_nchw", "scale.cl" }, - { "scale_bilinear_nhwc", 
"scale.cl" }, - { "scale_bilinear_quantized_nchw", "scale_quantized.cl" }, - { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" }, - { "select_same_rank", "select.cl" }, - { "select_different_rank_2", "select.cl" }, - { "select_different_rank_n", "select.cl" }, - { "softmax_layer_norm", "softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" }, - { "space_to_batch_nchw", "space_to_batch.cl" }, - { "space_to_batch_static_nchw", "space_to_batch.cl" }, - { "space_to_batch_nhwc", "space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "space_to_batch.cl" }, - { "space_to_depth_nchw", "space_to_depth.cl" }, - { "space_to_depth_nhwc", "space_to_depth.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" }, - { "stack_layer", "stack_layer.cl" }, - { "strided_slice", "slice_ops.cl" }, - { "tile", "tile.cl" }, - { "transpose", "transpose.cl" }, - { "upsample_layer_nchw", "upsample_layer.cl" }, - { "upsample_layer_nhwc", "upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "winograd_filter_transform.cl" }, - { 
"winograd_filter_transform_4x1_3x1_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "winograd_input_transform.cl" }, - { 
"winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" }, + { "activation_layer", "common/activation_layer.cl" 
}, + { "activation_layer_quant", "common/activation_layer_quant.cl" }, + { "activation_layer_quant_f32", "common/activation_layer_quant.cl" }, + { "arg_min_max_x", "common/arg_min_max.cl" }, + { "arg_min_max_y", "common/arg_min_max.cl" }, + { "arg_min_max_z", "common/arg_min_max.cl" }, + { "arg_min_max_w", "common/arg_min_max.cl" }, + { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, + { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, + { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, + { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, + { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, + { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, + { "bitwise_or", "common/bitwise_op.cl" }, + { "bitwise_and", "common/bitwise_op.cl" }, + { "bitwise_xor", "common/bitwise_op.cl" }, + { "bitwise_not", "common/bitwise_op.cl" }, + { "bounding_box_transform", "common/bounding_box_transform.cl" }, + { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" }, + { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, + { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, + { "compare_equal", "common/comparisons.cl" }, + { "compare_equal_quantized", "common/comparisons.cl" }, + { "compare_notequal", "common/comparisons.cl" }, + { "compare_notequal_quantized", "common/comparisons.cl" }, + { "compare_greater", "common/comparisons.cl" }, + { "compare_greater_quantized", "common/comparisons.cl" }, + { "compare_greaterequal", "common/comparisons.cl" }, + { "compare_greaterequal_quantized", "common/comparisons.cl" }, + { "compare_less", "common/comparisons.cl" }, + { "compare_less_quantized", "common/comparisons.cl" }, + { "compare_lessequal", "common/comparisons.cl" }, + { "compare_lessequal_quantized", "common/comparisons.cl" }, + { "concatenate", "common/concatenate.cl" }, + { "concatenate_width", "common/concatenate.cl" }, + { "concatenate_height", "common/concatenate.cl" }, + { 
"concatenate_width_x2", "common/concatenate.cl" }, + { "concatenate_width_x4", "common/concatenate.cl" }, + { "col2im", "common/col2im.cl" }, + { "cast_down", "common/cast.cl" }, + { "cast_up", "common/cast.cl" }, + { "convert_fc_weights", "common/convert_fc_weights.cl" }, + { "copy_tensor", "common/copy_tensor.cl" }, + { "crop_tensor", "common/crop_tensor.cl" }, + { "deconvolution_reshape", "common/deconvolution_layer.cl" }, + { "deconvolution_upsample", "common/deconvolution_layer.cl" }, + { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, + { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, + { "dequantization_layer", "common/dequantization_layer.cl" }, + { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, + { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, + { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, + { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, + { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, + { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, + { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" }, + { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" }, + { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" }, + { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" }, + { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" }, + { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" }, + { "elementwise_operation_ADD", "common/elementwise_operation.cl" }, + { "elementwise_operation_SUB", "common/elementwise_operation.cl" }, + { "elementwise_operation_MAX", "common/elementwise_operation.cl" }, + { "elementwise_operation_MIN", "common/elementwise_operation.cl" }, + { "elementwise_operation_DIV", "common/elementwise_operation.cl" }, + { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" }, + { "elementwise_operation_POWER", 
"common/elementwise_operation.cl" }, + { "elementwise_operation_PRELU", "common/elementwise_operation.cl" }, + { "elementwise_operation_AND", "common/elementwise_operation.cl" }, + { "elementwise_operation_OR", "common/elementwise_operation.cl" }, + { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_unary", "common/elementwise_unary.cl" }, + { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, + { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, + { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_2_axis_0", "common/fft.cl" }, + { "fft_radix_2_axis_1", "common/fft.cl" }, + { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_3_axis_0", "common/fft.cl" }, + { "fft_radix_3_axis_1", "common/fft.cl" }, + { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_4_axis_0", "common/fft.cl" }, + { "fft_radix_4_axis_1", "common/fft.cl" }, + { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_5_axis_0", "common/fft.cl" }, + { "fft_radix_5_axis_1", "common/fft.cl" }, + { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_7_first_stage_axis_1", 
"common/fft.cl" }, + { "fft_radix_7_axis_0", "common/fft.cl" }, + { "fft_radix_7_axis_1", "common/fft.cl" }, + { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_8_axis_0", "common/fft.cl" }, + { "fft_radix_8_axis_1", "common/fft.cl" }, + { "fft_scale_conj", "common/fft_scale.cl" }, + { "fill_image_borders_constant", "common/fill_border.cl" }, + { "fill_image_borders_replicate", "common/fill_border.cl" }, + { "floor_layer", "common/floor.cl" }, + { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, + { "gather", "common/gather.cl" }, + { "gemm_ma_f16", "common/gemm.cl" }, + { "gemm_ma_f32", "common/gemm.cl" }, + { "gemm_mv", "common/gemv.cl" }, + { "gemm_mv_quantized", "common/gemv.cl" }, + { "gemm_mm_interleaved_transposed_f16", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f16_acc32", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f16_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f32", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f32_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f16_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f16_bifrost_acc32", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f32_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f32_bifrost_1000", "common/gemm_v1.cl" }, + { "gemm_mm_native", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, + { 
"gemm_lc_vm_f32", "common/gemm.cl" }, + { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" }, + { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" }, + { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" }, + { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" }, + { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, + { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, + { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, + { "gemmlowp_mm_native", "common/gemmlowp.cl" }, + { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, + { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, + { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, + { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, + { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, + { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, + { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, + { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, + { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, + { "im2col3x3_nchw", "nchw/im2col.cl" }, + { "im2col5x5_nchw", "nchw/im2col.cl" }, + { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, + { "im2col_generic_nchw", "nchw/im2col.cl" }, + { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, + { "im2col3x3_nhwc", "nhwc/im2col.cl" }, + { "im2col9x9_nhwc", "nhwc/im2col.cl" }, + { "im2col_generic_nhwc", "nhwc/im2col.cl" }, + { "instance_normalization", "common/instance_normalization.cl" }, + { "compute_mean_var", "common/instance_normalization.cl" }, + { "l2_normalize_x", 
"common/l2_normalize.cl" }, + { "l2_normalize_y", "common/l2_normalize.cl" }, + { "l2_normalize_z", "common/l2_normalize.cl" }, + { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, + { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, + { "memset", "common/memset.cl" }, + { "minmax_layer", "common/minmax_layer.cl" }, + { "non_max_suppression", "common/nonmax.cl" }, + { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, + { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, + { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, + { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, + { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, + { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, + { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, + { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, + { "pad_layer_constant", "common/pad_layer.cl" }, + { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, + { "permute", "common/permute.cl" }, + { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, + { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, + { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, + { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, + { "pooling_layer_2", "common/pooling_layer.cl" }, + { "pooling_layer_3", "common/pooling_layer.cl" }, + { "pooling_layer_optimized_3", "common/pooling_layer.cl" }, + { "pooling_layer_7", "common/pooling_layer.cl" }, + { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, + { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, + { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, + { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" }, + { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" }, + { "pooling_layer_MxN_quantized_nhwc", 
"nhwc/pooling_layer_quantized.cl" }, + { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" }, + { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, + { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, + { "quantization_layer", "common/quantization_layer.cl" }, + { "range", "common/range.cl" }, + { "range_quantized", "common/range.cl" }, + { "reduction_operation_x", "common/reduction_operation.cl" }, + { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" }, + { "reduction_operation_y", "common/reduction_operation.cl" }, + { "reduction_operation_z", "common/reduction_operation.cl" }, + { "reduction_operation_w", "common/reduction_operation.cl" }, + { "remap_nearest_neighbour_nchw", "nchw/remap.cl" }, + { "remap_bilinear_nchw", "nchw/remap.cl" }, + { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" }, + { "remap_bilinear_nhwc", "nhwc/remap.cl" }, + { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, + { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, + { "reshape_layer", "common/reshape_layer.cl" }, + { "reshape_to_columns", "common/convolution_layer.cl" }, + { "reverse", "common/reverse.cl" }, + { "roi_align_layer", "common/roi_align_layer.cl" }, + { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, + { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, + { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, + { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, + { "scale_bilinear_nchw", "nchw/scale.cl" }, + { "scale_bilinear_nhwc", "nhwc/scale.cl" }, + { "scale_bilinear_quantized_nchw", "nchw/scale_quantized.cl" }, + { "scale_bilinear_quantized_nhwc", "nhwc/scale_quantized.cl" }, + { "select_same_rank", "common/select.cl" }, + { "select_different_rank_2", "common/select.cl" }, + { "select_different_rank_n", "common/select.cl" }, + { "softmax_layer_norm", "common/softmax_layer.cl" }, + { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, + { 
"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, + { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, + { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, + { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, + { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, + { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, + { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, + { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, + { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, + { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, + { "stack_layer", "common/stack_layer.cl" }, + { "strided_slice", "common/slice_ops.cl" }, + { "tile", "common/tile.cl" }, + { "transpose", "common/transpose.cl" }, + { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, + { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, + { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, + { 
"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_5x1_stepz1_nhwc", 
"nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, }; const std::map<std::string, std::string> 
ClKernelLibrary::_program_source_map = { #ifdef EMBEDDED_KERNELS { - "activation_layer.cl", -#include "./cl_kernels/activation_layer.clembed" + "common/activation_layer.cl", +#include "./cl_kernels/common/activation_layer.clembed" }, { - "activation_layer_quant.cl", -#include "./cl_kernels/activation_layer_quant.clembed" + "common/activation_layer_quant.cl", +#include "./cl_kernels/common/activation_layer_quant.clembed" }, { - "arg_min_max.cl", -#include "./cl_kernels/arg_min_max.clembed" + "common/arg_min_max.cl", +#include "./cl_kernels/common/arg_min_max.clembed" }, { - "batch_to_space.cl", -#include "./cl_kernels/batch_to_space.clembed" + "nchw/batch_to_space.cl", +#include "./cl_kernels/nchw/batch_to_space.clembed" }, { - "bitwise_op.cl", -#include "./cl_kernels/bitwise_op.clembed" + "nhwc/batch_to_space.cl", +#include "./cl_kernels/nhwc/batch_to_space.clembed" }, { - "bounding_box_transform.cl", -#include "./cl_kernels/bounding_box_transform.clembed" + "common/bitwise_op.cl", +#include "./cl_kernels/common/bitwise_op.clembed" }, { - "bounding_box_transform_quantized.cl", -#include "./cl_kernels/bounding_box_transform_quantized.clembed" + "common/bounding_box_transform.cl", +#include "./cl_kernels/common/bounding_box_transform.clembed" }, { - "channel_shuffle.cl", -#include "./cl_kernels/channel_shuffle.clembed" + "common/bounding_box_transform_quantized.cl", +#include "./cl_kernels/common/bounding_box_transform_quantized.clembed" }, { - "col2im.cl", -#include "./cl_kernels/col2im.clembed" + "nchw/channel_shuffle.cl", +#include "./cl_kernels/nchw/channel_shuffle.clembed" }, { - "comparisons.cl", -#include "./cl_kernels/comparisons.clembed" + "nhwc/channel_shuffle.cl", +#include "./cl_kernels/nhwc/channel_shuffle.clembed" }, { - "concatenate.cl", -#include "./cl_kernels/concatenate.clembed" + "common/col2im.cl", +#include "./cl_kernels/common/col2im.clembed" }, { - "convert_fc_weights.cl", -#include "./cl_kernels/convert_fc_weights.clembed" + 
"common/comparisons.cl", +#include "./cl_kernels/common/comparisons.clembed" }, { - "convolution_layer.cl", -#include "./cl_kernels/convolution_layer.clembed" + "common/concatenate.cl", +#include "./cl_kernels/common/concatenate.clembed" }, { - "copy_tensor.cl", -#include "./cl_kernels/copy_tensor.clembed" + "common/convert_fc_weights.cl", +#include "./cl_kernels/common/convert_fc_weights.clembed" }, { - "crop_tensor.cl", -#include "./cl_kernels/crop_tensor.clembed" + "common/convolution_layer.cl", +#include "./cl_kernels/common/convolution_layer.clembed" }, { - "upsample_layer.cl", -#include "./cl_kernels/upsample_layer.clembed" + "common/copy_tensor.cl", +#include "./cl_kernels/common/copy_tensor.clembed" }, { - "deconvolution_layer.cl", -#include "./cl_kernels/deconvolution_layer.clembed" + "common/crop_tensor.cl", +#include "./cl_kernels/common/crop_tensor.clembed" }, { - "cast.cl", -#include "./cl_kernels/cast.clembed" + "nchw/upsample_layer.cl", +#include "./cl_kernels/nchw/upsample_layer.clembed" }, { - "depth_to_space.cl", -#include "./cl_kernels/depth_to_space.clembed" + "nhwc/upsample_layer.cl", +#include "./cl_kernels/nhwc/upsample_layer.clembed" }, { - "dequantization_layer.cl", -#include "./cl_kernels/dequantization_layer.clembed" + "common/deconvolution_layer.cl", +#include "./cl_kernels/common/deconvolution_layer.clembed" }, { - "direct_convolution1x1.cl", -#include "./cl_kernels/direct_convolution1x1.clembed" + "common/cast.cl", +#include "./cl_kernels/common/cast.clembed" }, { - "direct_convolution3x3.cl", -#include "./cl_kernels/direct_convolution3x3.clembed" + "nchw/depth_to_space.cl", +#include "./cl_kernels/nchw/depth_to_space.clembed" }, { - "direct_convolution5x5.cl", -#include "./cl_kernels/direct_convolution5x5.clembed" + "nhwc/depth_to_space.cl", +#include "./cl_kernels/nhwc/depth_to_space.clembed" }, { - "direct_convolution_quantized.cl", -#include "./cl_kernels/direct_convolution_quantized.clembed" + "common/dequantization_layer.cl", 
+#include "./cl_kernels/common/dequantization_layer.clembed" }, { - "direct_convolution.cl", -#include "./cl_kernels/direct_convolution.clembed" + "nchw/dequantization_layer.cl", +#include "./cl_kernels/nchw/dequantization_layer.clembed" }, { - "dwc_native_fp_nhwc.cl", -#include "./cl_kernels/dwc_native_fp_nhwc.clembed" + "nhwc/dequantization_layer.cl", +#include "./cl_kernels/nhwc/dequantization_layer.clembed" }, { - "dwc_native_quantized_nhwc.cl", -#include "./cl_kernels/dwc_native_quantized_nhwc.clembed" + "nchw/direct_convolution1x1.cl", +#include "./cl_kernels/nchw/direct_convolution1x1.clembed" }, { - "elementwise_operation.cl", -#include "./cl_kernels/elementwise_operation.clembed" + "nchw/direct_convolution3x3.cl", +#include "./cl_kernels/nchw/direct_convolution3x3.clembed" }, { - "elementwise_operation_quantized.cl", -#include "./cl_kernels/elementwise_operation_quantized.clembed" + "nchw/direct_convolution5x5.cl", +#include "./cl_kernels/nchw/direct_convolution5x5.clembed" }, { - "elementwise_unary.cl", -#include "./cl_kernels/elementwise_unary.clembed" + "nchw/direct_convolution_quantized.cl", +#include "./cl_kernels/nchw/direct_convolution_quantized.clembed" }, { - "fft.cl", -#include "./cl_kernels/fft.clembed" + "nhwc/direct_convolution.cl", +#include "./cl_kernels/nhwc/direct_convolution.clembed" }, { - "fft_digit_reverse.cl", -#include "./cl_kernels/fft_digit_reverse.clembed" + "nhwc/dwc_native_fp_nhwc.cl", +#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed" }, { - "fft_scale.cl", -#include "./cl_kernels/fft_scale.clembed" + "nhwc/dwc_native_quantized_nhwc.cl", +#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed" }, { - "fill_border.cl", -#include "./cl_kernels/fill_border.clembed" + "common/elementwise_operation.cl", +#include "./cl_kernels/common/elementwise_operation.clembed" }, { - "floor.cl", -#include "./cl_kernels/floor.clembed" + "common/elementwise_operation_quantized.cl", +#include 
"./cl_kernels/common/elementwise_operation_quantized.clembed" }, { - "gather.cl", -#include "./cl_kernels/gather.clembed" + "common/elementwise_unary.cl", +#include "./cl_kernels/common/elementwise_unary.clembed" }, { - "gemm.cl", -#include "./cl_kernels/gemm.clembed" + "common/fft.cl", +#include "./cl_kernels/common/fft.clembed" }, { - "gemm_v1.cl", -#include "./cl_kernels/gemm_v1.clembed" + "common/fft_digit_reverse.cl", +#include "./cl_kernels/common/fft_digit_reverse.clembed" }, { - "gemmlowp.cl", -#include "./cl_kernels/gemmlowp.clembed" + "common/fft_scale.cl", +#include "./cl_kernels/common/fft_scale.clembed" }, { - "gemv.cl", -#include "./cl_kernels/gemv.clembed" + "common/fill_border.cl", +#include "./cl_kernels/common/fill_border.clembed" }, { - "generate_proposals.cl", -#include "./cl_kernels/generate_proposals.clembed" + "common/floor.cl", +#include "./cl_kernels/common/floor.clembed" }, { - "generate_proposals_quantized.cl", -#include "./cl_kernels/generate_proposals_quantized.clembed" + "common/gather.cl", +#include "./cl_kernels/common/gather.clembed" + }, + { + "common/gemm.cl", +#include "./cl_kernels/common/gemm.clembed" + }, + { + "common/gemm_v1.cl", +#include "./cl_kernels/common/gemm_v1.clembed" + }, + { + "common/gemmlowp.cl", +#include "./cl_kernels/common/gemmlowp.clembed" + }, + { + "common/gemv.cl", +#include "./cl_kernels/common/gemv.clembed" + }, + { + "common/generate_proposals.cl", +#include "./cl_kernels/common/generate_proposals.clembed" + }, + { + "common/generate_proposals_quantized.cl", +#include "./cl_kernels/common/generate_proposals_quantized.clembed" }, { "helpers.h", @@ -665,184 +689,256 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = #include "./cl_kernels/helpers_asymm.hembed" }, { - "im2col.cl", -#include "./cl_kernels/im2col.clembed" + "nchw/im2col.cl", +#include "./cl_kernels/nchw/im2col.clembed" + }, + { + "nhwc/im2col.cl", +#include "./cl_kernels/nhwc/im2col.clembed" + }, + { + 
"common/instance_normalization.cl", +#include "./cl_kernels/common/instance_normalization.clembed" + }, + { + "common/l2_normalize.cl", +#include "./cl_kernels/common/l2_normalize.clembed" + }, + { + "common/mean_stddev_normalization.cl", +#include "./cl_kernels/common/mean_stddev_normalization.clembed" + }, + { + "common/memset.cl", +#include "./cl_kernels/common/memset.clembed" + }, + { + "common/minmax_layer.cl", +#include "./cl_kernels/common/minmax_layer.clembed" }, { - "instance_normalization.cl", -#include "./cl_kernels/instance_normalization.clembed" + "common/nonmax.cl", +#include "./cl_kernels/common/nonmax.clembed" }, { - "l2_normalize.cl", -#include "./cl_kernels/l2_normalize.clembed" + "nchw/normalization_layer.cl", +#include "./cl_kernels/nchw/normalization_layer.clembed" }, { - "mean_stddev_normalization.cl", -#include "./cl_kernels/mean_stddev_normalization.clembed" + "nhwc/normalization_layer.cl", +#include "./cl_kernels/nhwc/normalization_layer.clembed" }, { - "memset.cl", -#include "./cl_kernels/memset.clembed" + "nchw/normalize_planar_yuv_layer.cl", +#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed" }, { - "minmax_layer.cl", -#include "./cl_kernels/minmax_layer.clembed" + "nhwc/normalize_planar_yuv_layer.cl", +#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed" }, { - "nonmax.cl", -#include "./cl_kernels/nonmax.clembed" + "nchw/normalize_planar_yuv_layer_quantized.cl", +#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed" }, { - "normalization_layer.cl", -#include "./cl_kernels/normalization_layer.clembed" + "nhwc/normalize_planar_yuv_layer_quantized.cl", +#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed" }, { - "normalize_planar_yuv_layer.cl", -#include "./cl_kernels/normalize_planar_yuv_layer.clembed" + "common/batchnormalization_layer.cl", +#include "./cl_kernels/common/batchnormalization_layer.clembed" }, { - "normalize_planar_yuv_layer_quantized.cl", -#include 
"./cl_kernels/normalize_planar_yuv_layer_quantized.clembed" + "nchw/batchnormalization_layer.cl", +#include "./cl_kernels/nchw/batchnormalization_layer.clembed" }, { - "batchnormalization_layer.cl", -#include "./cl_kernels/batchnormalization_layer.clembed" + "nhwc/batchnormalization_layer.cl", +#include "./cl_kernels/nhwc/batchnormalization_layer.clembed" }, { - "pad_layer.cl", -#include "./cl_kernels/pad_layer.clembed" + "common/pad_layer.cl", +#include "./cl_kernels/common/pad_layer.clembed" }, { - "permute.cl", -#include "./cl_kernels/permute.clembed" + "common/permute.cl", +#include "./cl_kernels/common/permute.clembed" }, { - "pixelwise_mul_float.cl", -#include "./cl_kernels/pixelwise_mul_float.clembed" + "common/pixelwise_mul_float.cl", +#include "./cl_kernels/common/pixelwise_mul_float.clembed" }, { - "pixelwise_mul_int.cl", -#include "./cl_kernels/pixelwise_mul_int.clembed" + "common/pixelwise_mul_int.cl", +#include "./cl_kernels/common/pixelwise_mul_int.clembed" }, { - "pooling_layer.cl", -#include "./cl_kernels/pooling_layer.clembed" + "common/pooling_layer.cl", +#include "./cl_kernels/common/pooling_layer.clembed" }, { - "pooling_layer_quantized.cl", -#include "./cl_kernels/pooling_layer_quantized.clembed" + "nchw/pooling_layer.cl", +#include "./cl_kernels/nchw/pooling_layer.clembed" }, { - "prior_box_layer.cl", -#include "./cl_kernels/prior_box_layer.clembed" + "nhwc/pooling_layer.cl", +#include "./cl_kernels/nhwc/pooling_layer.clembed" }, { - "qlstm_layer_normalization.cl", -#include "./cl_kernels/qlstm_layer_normalization.clembed" + "nchw/pooling_layer_quantized.cl", +#include "./cl_kernels/nchw/pooling_layer_quantized.clembed" }, { - "quantization_layer.cl", -#include "./cl_kernels/quantization_layer.clembed" + "nhwc/pooling_layer_quantized.cl", +#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed" }, { - "range.cl", -#include "./cl_kernels/range.clembed" + "nchw/prior_box_layer.cl", +#include "./cl_kernels/nchw/prior_box_layer.clembed" }, { 
- "reduction_operation.cl", -#include "./cl_kernels/reduction_operation.clembed" + "common/qlstm_layer_normalization.cl", +#include "./cl_kernels/common/qlstm_layer_normalization.clembed" }, { - "remap.cl", -#include "./cl_kernels/remap.clembed" + "common/quantization_layer.cl", +#include "./cl_kernels/common/quantization_layer.clembed" }, { - "reorg_layer.cl", -#include "./cl_kernels/reorg_layer.clembed" + "common/range.cl", +#include "./cl_kernels/common/range.clembed" }, { - "reshape_layer.cl", -#include "./cl_kernels/reshape_layer.clembed" + "common/reduction_operation.cl", +#include "./cl_kernels/common/reduction_operation.clembed" }, { - "reverse.cl", -#include "./cl_kernels/reverse.clembed" + "nchw/remap.cl", +#include "./cl_kernels/nchw/remap.clembed" }, { - "roi_align_layer.cl", -#include "./cl_kernels/roi_align_layer.clembed" + "nhwc/remap.cl", +#include "./cl_kernels/nhwc/remap.clembed" }, { - "roi_align_layer_quantized.cl", -#include "./cl_kernels/roi_align_layer_quantized.clembed" + "nchw/reorg_layer.cl", +#include "./cl_kernels/nchw/reorg_layer.clembed" }, { - "roi_pooling_layer.cl", -#include "./cl_kernels/roi_pooling_layer.clembed" + "nhwc/reorg_layer.cl", +#include "./cl_kernels/nhwc/reorg_layer.clembed" }, { - "scale.cl", -#include "./cl_kernels/scale.clembed" + "common/reshape_layer.cl", +#include "./cl_kernels/common/reshape_layer.clembed" }, { - "scale_quantized.cl", -#include "./cl_kernels/scale_quantized.clembed" + "common/reverse.cl", +#include "./cl_kernels/common/reverse.clembed" }, { - "select.cl", -#include "./cl_kernels/select.clembed" + "common/roi_align_layer.cl", +#include "./cl_kernels/common/roi_align_layer.clembed" }, { - "softmax_layer.cl", -#include "./cl_kernels/softmax_layer.clembed" + "common/roi_align_layer_quantized.cl", +#include "./cl_kernels/common/roi_align_layer_quantized.clembed" }, { - "softmax_layer_quantized.cl", -#include "./cl_kernels/softmax_layer_quantized.clembed" + "common/roi_pooling_layer.cl", +#include 
"./cl_kernels/common/roi_pooling_layer.clembed" }, { - "slice_ops.cl", -#include "./cl_kernels/slice_ops.clembed" + "nchw/scale.cl", +#include "./cl_kernels/nchw/scale.clembed" }, { - "space_to_batch.cl", -#include "./cl_kernels/space_to_batch.clembed" + "nhwc/scale.cl", +#include "./cl_kernels/nhwc/scale.clembed" }, { - "space_to_depth.cl", -#include "./cl_kernels/space_to_depth.clembed" + "nchw/scale_quantized.cl", +#include "./cl_kernels/nchw/scale_quantized.clembed" }, { - "stack_layer.cl", -#include "./cl_kernels/stack_layer.clembed" + "nhwc/scale_quantized.cl", +#include "./cl_kernels/nhwc/scale_quantized.clembed" }, { - "tile.cl", -#include "./cl_kernels/tile.clembed" + "common/select.cl", +#include "./cl_kernels/common/select.clembed" }, { - "transpose.cl", -#include "./cl_kernels/transpose.clembed" + "common/softmax_layer.cl", +#include "./cl_kernels/common/softmax_layer.clembed" + }, + { + "common/softmax_layer_quantized.cl", +#include "./cl_kernels/common/softmax_layer_quantized.clembed" + }, + { + "common/slice_ops.cl", +#include "./cl_kernels/common/slice_ops.clembed" + }, + { + "nchw/space_to_batch.cl", +#include "./cl_kernels/nchw/space_to_batch.clembed" + }, + { + "nhwc/space_to_batch.cl", +#include "./cl_kernels/nhwc/space_to_batch.clembed" + }, + { + "nchw/space_to_depth.cl", +#include "./cl_kernels/nchw/space_to_depth.clembed" + }, + { + "nhwc/space_to_depth.cl", +#include "./cl_kernels/nhwc/space_to_depth.clembed" + }, + { + "common/stack_layer.cl", +#include "./cl_kernels/common/stack_layer.clembed" + }, + { + "common/tile.cl", +#include "./cl_kernels/common/tile.clembed" + }, + { + "common/transpose.cl", +#include "./cl_kernels/common/transpose.clembed" }, { "types.h", #include "./cl_kernels/types.hembed" }, { - "unpooling_layer.cl", -#include "./cl_kernels/unpooling_layer.clembed" + "common/unpooling_layer.cl", +#include "./cl_kernels/common/unpooling_layer.clembed" + }, + { + "nchw/winograd_filter_transform.cl", +#include 
"./cl_kernels/nchw/winograd_filter_transform.clembed" + }, + { + "nhwc/winograd_filter_transform.cl", +#include "./cl_kernels/nhwc/winograd_filter_transform.clembed" + }, + { + "nchw/winograd_input_transform.cl", +#include "./cl_kernels/nchw/winograd_input_transform.clembed" }, { - "winograd_filter_transform.cl", -#include "./cl_kernels/winograd_filter_transform.clembed" + "nhwc/winograd_input_transform.cl", +#include "./cl_kernels/nhwc/winograd_input_transform.clembed" }, { - "winograd_input_transform.cl", -#include "./cl_kernels/winograd_input_transform.clembed" + "nchw/winograd_output_transform.cl", +#include "./cl_kernels/nchw/winograd_output_transform.clembed" }, { - "winograd_output_transform.cl", -#include "./cl_kernels/winograd_output_transform.clembed" + "nhwc/winograd_output_transform.cl", +#include "./cl_kernels/nhwc/winograd_output_transform.clembed" }, #endif /* EMBEDDED_KERNELS */ }; |