aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
authorAdnan AlSinan <adnan.alsinan@arm.com>2021-07-05 13:12:52 +0100
committerGeorgios Pinitas <georgios.pinitas@arm.com>2021-07-25 13:04:23 +0000
commit7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4 (patch)
treeb65671bdf37eb1ef8cc30ef64ab572da795546fa
parent22f5ed51f1b01f7cf6993a556a0b763e437926fc (diff)
downloadComputeLibrary-7075fe2c5ee6f7cfe7cfd9454d905235e70b9ac4.tar.gz
Reorganize the kernels into nhwc, nchw and common folders
The Following kernels have been split into nchw/nhwc kernels files: - batchnormalization_layer - batch_to_space - channel_shuffle - depth_to_space - dequantization_layer - im2col - normalization_layer - normalize_planar_yuv_layer - normalize_planar_yuv_layer_quantized - pooling_layer - pooling_layer_quantized - remap - reorg_layer - scale - scale_quantized - space_to_batch - space_to_depth - upsample_layer - winograd_filter_transform - winograd_input_transform - winograd_output_transform The following kernels have been moved to nchw folder: - direct_convolution1x1 - direct_convolution3x3 - direct_convolution5x5 - direct_convolution_quantized - prior_box_layer The following kernels have been moved to nhwc folder: - direct_convolution - dwc_native_fp_nhwc - dwc_native_quantized_nhwc The following kernels have been removed: - sobel_filter While the rest kerenls have been moved to the common folder. Partially resolves COMPMID-4453 Signed-off-by: Adnan AlSinan <adnan.alsinan@arm.com> Change-Id: Ic327ac935687ec351c610c65a3c6357f364a5a58 Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/5919 Tested-by: Arm Jenkins <bsgcomp@arm.com> Reviewed-by: Georgios Pinitas <georgios.pinitas@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com>
-rw-r--r--SConscript141
-rw-r--r--docs/Doxyfile2
-rw-r--r--src/core/CL/cl_kernels/batch_to_space.cl232
-rw-r--r--src/core/CL/cl_kernels/batchnormalization_layer.cl418
-rw-r--r--src/core/CL/cl_kernels/common/activation_layer.cl (renamed from src/core/CL/cl_kernels/activation_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/activation_layer_quant.cl (renamed from src/core/CL/cl_kernels/activation_layer_quant.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/arg_min_max.cl (renamed from src/core/CL/cl_kernels/arg_min_max.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/batchnormalization_layer.cl183
-rw-r--r--src/core/CL/cl_kernels/common/bitwise_op.cl (renamed from src/core/CL/cl_kernels/bitwise_op.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/bounding_box_transform.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl (renamed from src/core/CL/cl_kernels/bounding_box_transform_quantized.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/cast.cl (renamed from src/core/CL/cl_kernels/cast.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/col2im.cl (renamed from src/core/CL/cl_kernels/col2im.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/comparisons.cl (renamed from src/core/CL/cl_kernels/comparisons.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/concatenate.cl (renamed from src/core/CL/cl_kernels/concatenate.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/convert_fc_weights.cl (renamed from src/core/CL/cl_kernels/convert_fc_weights.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/convolution_layer.cl (renamed from src/core/CL/cl_kernels/convolution_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/copy_tensor.cl (renamed from src/core/CL/cl_kernels/copy_tensor.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/crop_tensor.cl (renamed from src/core/CL/cl_kernels/crop_tensor.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/deconvolution_layer.cl (renamed from src/core/CL/cl_kernels/deconvolution_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/dequantization_layer.cl90
-rw-r--r--src/core/CL/cl_kernels/common/elementwise_operation.cl (renamed from src/core/CL/cl_kernels/elementwise_operation.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl (renamed from src/core/CL/cl_kernels/elementwise_operation_quantized.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/elementwise_unary.cl (renamed from src/core/CL/cl_kernels/elementwise_unary.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/fft.cl (renamed from src/core/CL/cl_kernels/fft.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/fft_digit_reverse.cl (renamed from src/core/CL/cl_kernels/fft_digit_reverse.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/fft_scale.cl (renamed from src/core/CL/cl_kernels/fft_scale.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/fill_border.cl (renamed from src/core/CL/cl_kernels/fill_border.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/floor.cl (renamed from src/core/CL/cl_kernels/floor.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/gather.cl (renamed from src/core/CL/cl_kernels/gather.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/gemm.cl (renamed from src/core/CL/cl_kernels/gemm.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/gemm_v1.cl (renamed from src/core/CL/cl_kernels/gemm_v1.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/gemmlowp.cl (renamed from src/core/CL/cl_kernels/gemmlowp.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/gemv.cl (renamed from src/core/CL/cl_kernels/gemv.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/generate_proposals.cl (renamed from src/core/CL/cl_kernels/generate_proposals.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/generate_proposals_quantized.cl (renamed from src/core/CL/cl_kernels/generate_proposals_quantized.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/instance_normalization.cl (renamed from src/core/CL/cl_kernels/instance_normalization.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/l2_normalize.cl (renamed from src/core/CL/cl_kernels/l2_normalize.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/mean_stddev_normalization.cl (renamed from src/core/CL/cl_kernels/mean_stddev_normalization.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/memset.cl (renamed from src/core/CL/cl_kernels/memset.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/minmax_layer.cl (renamed from src/core/CL/cl_kernels/minmax_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/nonmax.cl (renamed from src/core/CL/cl_kernels/nonmax.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/pad_layer.cl (renamed from src/core/CL/cl_kernels/pad_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/permute.cl (renamed from src/core/CL/cl_kernels/permute.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/pixelwise_mul_float.cl (renamed from src/core/CL/cl_kernels/pixelwise_mul_float.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/pixelwise_mul_int.cl (renamed from src/core/CL/cl_kernels/pixelwise_mul_int.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/pooling_layer.cl390
-rw-r--r--src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl (renamed from src/core/CL/cl_kernels/qlstm_layer_normalization.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/quantization_layer.cl (renamed from src/core/CL/cl_kernels/quantization_layer.cl)6
-rw-r--r--src/core/CL/cl_kernels/common/range.cl (renamed from src/core/CL/cl_kernels/range.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/reduction_operation.cl (renamed from src/core/CL/cl_kernels/reduction_operation.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/reshape_layer.cl (renamed from src/core/CL/cl_kernels/reshape_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/reverse.cl (renamed from src/core/CL/cl_kernels/reverse.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/roi_align_layer.cl (renamed from src/core/CL/cl_kernels/roi_align_layer.cl)6
-rw-r--r--src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl (renamed from src/core/CL/cl_kernels/roi_align_layer_quantized.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/roi_pooling_layer.cl (renamed from src/core/CL/cl_kernels/roi_pooling_layer.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/select.cl (renamed from src/core/CL/cl_kernels/select.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/slice_ops.cl (renamed from src/core/CL/cl_kernels/slice_ops.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/softmax_layer.cl (renamed from src/core/CL/cl_kernels/softmax_layer.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/softmax_layer_quantized.cl (renamed from src/core/CL/cl_kernels/softmax_layer_quantized.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/stack_layer.cl (renamed from src/core/CL/cl_kernels/stack_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/tile.cl (renamed from src/core/CL/cl_kernels/tile.cl)2
-rw-r--r--src/core/CL/cl_kernels/common/transpose.cl (renamed from src/core/CL/cl_kernels/transpose.cl)0
-rw-r--r--src/core/CL/cl_kernels/common/unpooling_layer.cl (renamed from src/core/CL/cl_kernels/unpooling_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/dequantization_layer.cl212
-rw-r--r--src/core/CL/cl_kernels/nchw/batch_to_space.cl (renamed from src/core/CL/cl_kernels/depth_to_space.cl)74
-rw-r--r--src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl147
-rw-r--r--src/core/CL/cl_kernels/nchw/channel_shuffle.cl103
-rw-r--r--src/core/CL/cl_kernels/nchw/depth_to_space.cl69
-rw-r--r--src/core/CL/cl_kernels/nchw/dequantization_layer.cl86
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl (renamed from src/core/CL/cl_kernels/direct_convolution1x1.cl)0
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl (renamed from src/core/CL/cl_kernels/direct_convolution3x3.cl)0
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl (renamed from src/core/CL/cl_kernels/direct_convolution5x5.cl)0
-rw-r--r--src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl (renamed from src/core/CL/cl_kernels/direct_convolution_quantized.cl)0
-rw-r--r--src/core/CL/cl_kernels/nchw/im2col.cl (renamed from src/core/CL/cl_kernels/im2col.cl)501
-rw-r--r--src/core/CL/cl_kernels/nchw/normalization_layer.cl175
-rw-r--r--src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl82
-rw-r--r--src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl101
-rw-r--r--src/core/CL/cl_kernels/nchw/pooling_layer.cl331
-rw-r--r--src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl142
-rw-r--r--src/core/CL/cl_kernels/nchw/prior_box_layer.cl (renamed from src/core/CL/cl_kernels/prior_box_layer.cl)2
-rw-r--r--src/core/CL/cl_kernels/nchw/remap.cl133
-rw-r--r--src/core/CL/cl_kernels/nchw/reorg_layer.cl (renamed from src/core/CL/cl_kernels/reorg_layer.cl)43
-rw-r--r--src/core/CL/cl_kernels/nchw/scale.cl148
-rw-r--r--src/core/CL/cl_kernels/nchw/scale_quantized.cl86
-rw-r--r--src/core/CL/cl_kernels/nchw/space_to_batch.cl (renamed from src/core/CL/cl_kernels/space_to_batch.cl)126
-rw-r--r--src/core/CL/cl_kernels/nchw/space_to_depth.cl69
-rw-r--r--src/core/CL/cl_kernels/nchw/upsample_layer.cl79
-rw-r--r--src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl911
-rw-r--r--src/core/CL/cl_kernels/nchw/winograd_input_transform.cl (renamed from src/core/CL/cl_kernels/winograd_input_transform.cl)887
-rw-r--r--src/core/CL/cl_kernels/nchw/winograd_output_transform.cl (renamed from src/core/CL/cl_kernels/winograd_output_transform.cl)981
-rw-r--r--src/core/CL/cl_kernels/nhwc/batch_to_space.cl (renamed from src/core/CL/cl_kernels/space_to_depth.cl)82
-rw-r--r--src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl146
-rw-r--r--src/core/CL/cl_kernels/nhwc/channel_shuffle.cl (renamed from src/core/CL/cl_kernels/channel_shuffle.cl)64
-rw-r--r--src/core/CL/cl_kernels/nhwc/depth_to_space.cl69
-rw-r--r--src/core/CL/cl_kernels/nhwc/dequantization_layer.cl87
-rw-r--r--src/core/CL/cl_kernels/nhwc/direct_convolution.cl (renamed from src/core/CL/cl_kernels/direct_convolution.cl)0
-rw-r--r--src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl (renamed from src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl)44
-rw-r--r--src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl (renamed from src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl)76
-rw-r--r--src/core/CL/cl_kernels/nhwc/im2col.cl532
-rw-r--r--src/core/CL/cl_kernels/nhwc/normalization_layer.cl (renamed from src/core/CL/cl_kernels/normalization_layer.cl)154
-rw-r--r--src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl (renamed from src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl)55
-rw-r--r--src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl (renamed from src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl)72
-rw-r--r--src/core/CL/cl_kernels/nhwc/pooling_layer.cl364
-rw-r--r--src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl (renamed from src/core/CL/cl_kernels/pooling_layer_quantized.cl)104
-rw-r--r--src/core/CL/cl_kernels/nhwc/remap.cl (renamed from src/core/CL/cl_kernels/remap.cl)112
-rw-r--r--src/core/CL/cl_kernels/nhwc/reorg_layer.cl76
-rw-r--r--src/core/CL/cl_kernels/nhwc/scale.cl (renamed from src/core/CL/cl_kernels/scale.cl)141
-rw-r--r--src/core/CL/cl_kernels/nhwc/scale_quantized.cl (renamed from src/core/CL/cl_kernels/scale_quantized.cl)81
-rw-r--r--src/core/CL/cl_kernels/nhwc/space_to_batch.cl155
-rw-r--r--src/core/CL/cl_kernels/nhwc/space_to_depth.cl69
-rw-r--r--src/core/CL/cl_kernels/nhwc/upsample_layer.cl (renamed from src/core/CL/cl_kernels/upsample_layer.cl)59
-rw-r--r--src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl (renamed from src/core/CL/cl_kernels/winograd_filter_transform.cl)881
-rw-r--r--src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl953
-rw-r--r--src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl1030
-rw-r--r--src/core/CL/cl_kernels/pooling_layer.cl981
-rw-r--r--src/core/CL/cl_kernels/sobel_filter.cl541
-rw-r--r--src/core/gpu/cl/ClKernelLibrary.cpp1052
118 files changed, 7755 insertions, 7245 deletions
diff --git a/SConscript b/SConscript
index 886ad083ad..682f55310e 100644
--- a/SConscript
+++ b/SConscript
@@ -228,10 +228,143 @@ cpp_compiler = os.environ.get('CXX', default_cpp_compiler)
# Generate embed files
generate_embed = [ version_file ]
if env['opencl'] and env['embed_kernels']:
- cl_files = Glob('src/core/CL/cl_kernels/*.cl')
- cl_files += Glob('src/core/CL/cl_kernels/*.h')
-
- embed_files = [ f.get_path()+"embed" for f in cl_files ]
+
+ # Header files
+ cl_helper_files = [ 'src/core/CL/cl_kernels/activation_float_helpers.h',
+ 'src/core/CL/cl_kernels/activation_quant_helpers.h',
+ 'src/core/CL/cl_kernels/gemm_helpers.h',
+ 'src/core/CL/cl_kernels/helpers_asymm.h',
+ 'src/core/CL/cl_kernels/helpers.h',
+ 'src/core/CL/cl_kernels/load_store_utility.h',
+ 'src/core/CL/cl_kernels/repeat.h',
+ 'src/core/CL/cl_kernels/tile_helpers.h',
+ 'src/core/CL/cl_kernels/types.h',
+ 'src/core/CL/cl_kernels/warp_helpers_quantized.h',
+ 'src/core/CL/cl_kernels/warp_helpers.h'
+ ]
+
+ # Common kernels
+ cl_files_common = ['src/core/CL/cl_kernels/common/activation_layer.cl',
+ 'src/core/CL/cl_kernels/common/activation_layer_quant.cl',
+ 'src/core/CL/cl_kernels/common/arg_min_max.cl',
+ 'src/core/CL/cl_kernels/common/batchnormalization_layer.cl',
+ 'src/core/CL/cl_kernels/common/bounding_box_transform.cl',
+ 'src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl',
+ 'src/core/CL/cl_kernels/common/bitwise_op.cl',
+ 'src/core/CL/cl_kernels/common/cast.cl',
+ 'src/core/CL/cl_kernels/common/comparisons.cl',
+ 'src/core/CL/cl_kernels/common/concatenate.cl',
+ 'src/core/CL/cl_kernels/common/col2im.cl',
+ 'src/core/CL/cl_kernels/common/convert_fc_weights.cl',
+ 'src/core/CL/cl_kernels/common/copy_tensor.cl',
+ 'src/core/CL/cl_kernels/common/crop_tensor.cl',
+ 'src/core/CL/cl_kernels/common/deconvolution_layer.cl',
+ 'src/core/CL/cl_kernels/common/dequantization_layer.cl',
+ 'src/core/CL/cl_kernels/common/elementwise_operation.cl',
+ 'src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl',
+ 'src/core/CL/cl_kernels/common/elementwise_unary.cl',
+ 'src/core/CL/cl_kernels/common/fft_digit_reverse.cl',
+ 'src/core/CL/cl_kernels/common/fft.cl',
+ 'src/core/CL/cl_kernels/common/fft_scale.cl',
+ 'src/core/CL/cl_kernels/common/fill_border.cl',
+ 'src/core/CL/cl_kernels/common/floor.cl',
+ 'src/core/CL/cl_kernels/common/gather.cl',
+ 'src/core/CL/cl_kernels/common/gemm.cl',
+ 'src/core/CL/cl_kernels/common/gemv.cl',
+ 'src/core/CL/cl_kernels/common/gemm_v1.cl',
+ 'src/core/CL/cl_kernels/common/gemmlowp.cl',
+ 'src/core/CL/cl_kernels/common/generate_proposals.cl',
+ 'src/core/CL/cl_kernels/common/generate_proposals_quantized.cl',
+ 'src/core/CL/cl_kernels/common/instance_normalization.cl',
+ 'src/core/CL/cl_kernels/common/l2_normalize.cl',
+ 'src/core/CL/cl_kernels/common/mean_stddev_normalization.cl',
+ 'src/core/CL/cl_kernels/common/unpooling_layer.cl',
+ 'src/core/CL/cl_kernels/common/memset.cl',
+ 'src/core/CL/cl_kernels/common/nonmax.cl',
+ 'src/core/CL/cl_kernels/common/minmax_layer.cl',
+ 'src/core/CL/cl_kernels/common/pad_layer.cl',
+ 'src/core/CL/cl_kernels/common/permute.cl',
+ 'src/core/CL/cl_kernels/common/pixelwise_mul_float.cl',
+ 'src/core/CL/cl_kernels/common/pixelwise_mul_int.cl',
+ 'src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl',
+ 'src/core/CL/cl_kernels/common/quantization_layer.cl',
+ 'src/core/CL/cl_kernels/common/range.cl',
+ 'src/core/CL/cl_kernels/common/reduction_operation.cl',
+ 'src/core/CL/cl_kernels/common/pooling_layer.cl',
+ 'src/core/CL/cl_kernels/common/reshape_layer.cl',
+ 'src/core/CL/cl_kernels/common/convolution_layer.cl',
+ 'src/core/CL/cl_kernels/common/reverse.cl',
+ 'src/core/CL/cl_kernels/common/roi_align_layer.cl',
+ 'src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl',
+ 'src/core/CL/cl_kernels/common/roi_pooling_layer.cl',
+ 'src/core/CL/cl_kernels/common/select.cl',
+ 'src/core/CL/cl_kernels/common/softmax_layer.cl',
+ 'src/core/CL/cl_kernels/common/softmax_layer_quantized.cl',
+ 'src/core/CL/cl_kernels/common/stack_layer.cl',
+ 'src/core/CL/cl_kernels/common/slice_ops.cl',
+ 'src/core/CL/cl_kernels/common/tile.cl',
+ 'src/core/CL/cl_kernels/common/transpose.cl'
+ ]
+
+ # NCHW kernels
+ cl_files_nchw = ['src/core/CL/cl_kernels/nchw/batch_to_space.cl',
+ 'src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/channel_shuffle.cl',
+ 'src/core/CL/cl_kernels/nchw/depth_to_space.cl',
+ 'src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl',
+ 'src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl',
+ 'src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl',
+ 'src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl',
+ 'src/core/CL/cl_kernels/nchw/dequantization_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/im2col.cl',
+ 'src/core/CL/cl_kernels/nchw/normalization_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl',
+ 'src/core/CL/cl_kernels/nchw/pooling_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl',
+ 'src/core/CL/cl_kernels/nchw/prior_box_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/remap.cl',
+ 'src/core/CL/cl_kernels/nchw/reorg_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/scale.cl',
+ 'src/core/CL/cl_kernels/nchw/scale_quantized.cl',
+ 'src/core/CL/cl_kernels/nchw/space_to_batch.cl',
+ 'src/core/CL/cl_kernels/nchw/space_to_depth.cl',
+ 'src/core/CL/cl_kernels/nchw/upsample_layer.cl',
+ 'src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl',
+ 'src/core/CL/cl_kernels/nchw/winograd_input_transform.cl',
+ 'src/core/CL/cl_kernels/nchw/winograd_output_transform.cl'
+ ]
+
+ # NHWC kernels
+ cl_files_nhwc = ['src/core/CL/cl_kernels/nhwc/batch_to_space.cl',
+ 'src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/channel_shuffle.cl',
+ 'src/core/CL/cl_kernels/nhwc/direct_convolution.cl',
+ 'src/core/CL/cl_kernels/nhwc/depth_to_space.cl',
+ 'src/core/CL/cl_kernels/nhwc/dequantization_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl',
+ 'src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl',
+ 'src/core/CL/cl_kernels/nhwc/im2col.cl',
+ 'src/core/CL/cl_kernels/nhwc/normalization_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl',
+ 'src/core/CL/cl_kernels/nhwc/pooling_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl',
+ 'src/core/CL/cl_kernels/nhwc/remap.cl',
+ 'src/core/CL/cl_kernels/nhwc/reorg_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/scale.cl',
+ 'src/core/CL/cl_kernels/nhwc/scale_quantized.cl',
+ 'src/core/CL/cl_kernels/nhwc/space_to_batch.cl',
+ 'src/core/CL/cl_kernels/nhwc/space_to_depth.cl',
+ 'src/core/CL/cl_kernels/nhwc/upsample_layer.cl',
+ 'src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl',
+ 'src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl',
+ 'src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl'
+ ]
+
+ cl_files = cl_helper_files + cl_files_common + cl_files_nchw + cl_files_nhwc
+
+ embed_files = [ f+"embed" for f in cl_files ]
arm_compute_env.Append(CPPPATH =[Dir("./src/core/CL/").path] )
generate_embed.append(arm_compute_env.Command(embed_files, cl_files, action=resolve_includes))
diff --git a/docs/Doxyfile b/docs/Doxyfile
index 27e28618b9..f70ec931eb 100644
--- a/docs/Doxyfile
+++ b/docs/Doxyfile
@@ -2068,7 +2068,7 @@ SEARCH_INCLUDES = YES
# preprocessor.
# This tag requires that the tag SEARCH_INCLUDES is set to YES.
-INCLUDE_PATH =
+INCLUDE_PATH = ./src/core/CL/cl_kernels/
# You can use the INCLUDE_FILE_PATTERNS tag to specify one or more wildcard
# patterns (like *.h and *.hpp) to filter out the header-files in the
diff --git a/src/core/CL/cl_kernels/batch_to_space.cl b/src/core/CL/cl_kernels/batch_to_space.cl
deleted file mode 100644
index 8a71985b02..0000000000
--- a/src/core/CL/cl_kernels/batch_to_space.cl
+++ /dev/null
@@ -1,232 +0,0 @@
-/*
- * Copyright (c) 2018-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- VECTOR_DECLARATION(block_shape),
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
- const int block_x = *((__global int *)vector_offset(&block, 0));
- const int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- VECTOR_DECLARATION(block_shape),
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
-
- const int block_x = *((__global int *)vector_offset(&block, 0));
- const int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
-
-#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
-/** Batch to space transformation. (NCHW)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nchw(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int block_x = BLOCK_SHAPE_X;
- const int block_y = BLOCK_SHAPE_Y;
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-/** Batch to space transformation. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[in] batch_id The input tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void batch_to_space_static_nhwc(
- TENSOR3D_DECLARATION(input),
- const int batch_id,
- TENSOR4D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
-
- const int block_x = BLOCK_SHAPE_X;
- const int block_y = BLOCK_SHAPE_Y;
-
- const int r = (BATCH_SIZE / (block_x * block_y));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0);
- const int w = batch_id % r;
-
- const int out_x = x * block_x + (batch_id / r) % block_x;
- const int out_y = y * block_y + (batch_id / r) / block_x;
-
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
-}
-#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/batchnormalization_layer.cl b/src/core/CL/cl_kernels/batchnormalization_layer.cl
deleted file mode 100644
index 89cbe4440e..0000000000
--- a/src/core/CL/cl_kernels/batchnormalization_layer.cl
+++ /dev/null
@@ -1,418 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#define ADD_OP(a, b) ((a) + (b))
-#define SUB_OP(a, b) ((a) - (b))
-#define MUL_OP(a, b) ((a) * (b))
-#define INVSQRT_OP(a) rsqrt((a))
-#define SQCVT_SAT(a) (a)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
-#include "activation_float_helpers.h"
-
-/** Apply batch normalization.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
- * @param[in] epsilon Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
- VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
- float epsilon)
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
-#ifdef IN_PLACE
- Tensor3D out = in;
-#else /* IN_PLACE */
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-#endif /* IN_PLACE */
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector var = CONVERT_TO_VECTOR_STRUCT(var);
-#ifndef USE_DEFAULT_BETA
- Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
-#endif /* USE_DEFAULT_GAMMA */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- denominator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- numerator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- x_bar = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res = 0;
-
- const int current_slice = get_global_id(2);
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
- denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
- denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
- // Calculate x bar and store results
- numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
- numerator = SUB_OP(data, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
-
- res = MUL_OP(gamma_vec, x_bar);
-#else /* USE_DEFAULT_GAMMA */
- // gamma is equal to 1, no need to perform multiplications
- res = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
- // beta is not zero, hence we need to perform the addition
- res = ADD_OP(res, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
- res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Apply batch normalization on tensors with NHWC format.
- *
- * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
- * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
- * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
- * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
- * @param[in] epsilon Epsilon parameter in the batch normalization equation
- */
-__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
-#ifndef IN_PLACE
- TENSOR3D_DECLARATION(output),
-#endif /* not IN_PLACE */
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var),
-#ifndef USE_DEFAULT_BETA
- VECTOR_DECLARATION(beta),
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- VECTOR_DECLARATION(gamma),
-#endif /* USE_DEFAULT_GAMMA */
- float epsilon)
-{
- uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
-
- __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
-#ifdef IN_PLACE
- __global uchar *output_addr = input_ptr;
-#else /* IN_PLACE */
- __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
-#endif /* IN_PLACE */
- __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
- __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
-#ifndef USE_DEFAULT_BETA
- __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_BETA */
-#ifndef USE_DEFAULT_GAMMA
- __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
-#endif /* USE_DEFAULT_GAMMA */
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- data = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- denominator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- numerator = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- x_bar = 0;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res0 = 0;
-
- data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
- denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
- denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
-
- // Calculate x bar and store results
- numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
- numerator = SUB_OP(data, numerator);
- x_bar = MUL_OP(numerator, denominator);
-
-#ifndef USE_DEFAULT_GAMMA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
-
- res0 = MUL_OP(gamma_vec, x_bar);
-#else /* USE_DEFAULT_GAMMA */
- // gamma is equal to 1, no need to perform multiplications
- res0 = x_bar;
-#endif /* USE_DEFAULT_GAMMA */
-
-#ifndef USE_DEFAULT_BETA
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
- // beta is not zero, hence we need to perform the addition
- res0 = ADD_OP(res0, beta_vec);
-#endif /* USE_DEFAULT_BETA */
-
- res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
-
- STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
-}
-#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/
-
-#if defined(DATA_TYPE) && defined(EPSILON)
-/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
- *
- * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
- * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
- * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
- * For depthwise convolution weight do not pass DIM2
- * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
- * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
- *
- * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32
- * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes)
- * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes)
- * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes)
- * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor
- * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
- * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
- * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
- * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
- * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr
- * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
- * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
- * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
- * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes)
- * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
- * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
- * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
- * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
- * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes)
- * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
- * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
- * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes)
- * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor
- * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
- * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes)
- * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor
- */
-__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
-#if defined(BIAS)
- VECTOR_DECLARATION(b),
-#endif // defined(BIAS)
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(var)
-#ifndef IN_PLACE_W
- ,
- TENSOR3D_DECLARATION(w_fused)
-#endif // ifndef IN_PLACE_W
-#ifndef IN_PLACE_B
- ,
- VECTOR_DECLARATION(b_fused)
-#endif // ifndef IN_PLACE_B
-#if defined(BETA)
- ,
- VECTOR_DECLARATION(beta)
-#endif // defined(BETA)
-#if defined(GAMMA)
- ,
- VECTOR_DECLARATION(gamma)
-#endif // defined(GAMMA)
- )
-{
- int x = get_global_id(0);
- int y = get_global_id(1);
- int z = get_global_id(2);
-
-#if defined(DIM2)
- int c0 = z % DIM2;
- int c1 = z / DIM2;
-#else // ! defined(DIM2)
- int c0 = 0;
-#if defined(NHWC)
- int c1 = x;
-#else // defined(NHWC)
- int c1 = z;
-#endif // defined(NHWC)
-#endif // defined(DIM2)
-
- int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
- int v_offset = c1 * sizeof(DATA_TYPE);
-
- DATA_TYPE w_old = 0.0f;
- DATA_TYPE b_old = 0.0f;
- DATA_TYPE w_new = 0.0f;
- DATA_TYPE b_new = 0.0f;
- DATA_TYPE gamma = 1.0f;
- DATA_TYPE mean = 0.0f;
- DATA_TYPE var = 1.0f;
- DATA_TYPE beta = 0.0f;
-
- w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
- var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
- mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
-
-#if defined(GAMMA)
- gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
-#endif // defined(GAMMA)
-
- // Compute new weight
- w_new = (gamma * w_old) / (sqrt(var + EPSILON));
-
-#if defined(IN_PLACE_W)
- *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
-#else // defined(IN_PLACE_W)
- *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
-#endif // defined(IN_PLACE_W)
-
- // Compute bias
-#if !defined(DIM2) && defined(NHWC)
- if(z == 0 && y == 0)
-#else // !defined(DIM2) && defined(NHWC)
- if(x == 0 && y == 0 && c0 == 0)
-#endif // !defined(DIM2) && defined(NHWC)
- {
-#if defined(BIAS)
- b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
-#endif // defined(BIAS)
-#if defined(BETA)
- beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
-#endif // defined(BETA)
-
- b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
-
-#if defined(BIAS)
-
-#if defined(IN_PLACE_B)
- *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
-#else // defined(IN_PLACE_B)
- *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // defined(IN_PLACE_B)
-
-#else // defined(BIAS)
-
-#ifndef IN_PLACE_B
- *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
-#endif // ifndef IN_PLACE_B
-
-#endif // defined(BIAS)
- }
-}
-#endif // defined(DATA_TYPE) && defined(EPSILON) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/activation_layer.cl b/src/core/CL/cl_kernels/common/activation_layer.cl
index bc2c99b6c8..a04556a1ed 100644
--- a/src/core/CL/cl_kernels/activation_layer.cl
+++ b/src/core/CL/cl_kernels/common/activation_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/activation_layer_quant.cl b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
index 66261019ab..38ee00b17a 100644
--- a/src/core/CL/cl_kernels/activation_layer_quant.cl
+++ b/src/core/CL/cl_kernels/common/activation_layer_quant.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/arg_min_max.cl b/src/core/CL/cl_kernels/common/arg_min_max.cl
index 6e57ed0af1..6e57ed0af1 100644
--- a/src/core/CL/cl_kernels/arg_min_max.cl
+++ b/src/core/CL/cl_kernels/common/arg_min_max.cl
diff --git a/src/core/CL/cl_kernels/common/batchnormalization_layer.cl b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
new file mode 100644
index 0000000000..18f54907df
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/batchnormalization_layer.cl
@@ -0,0 +1,183 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(EPSILON)
+/** OpenCL kernel to fuse the weights of convolution or depthwise convolution layer with batch normalization when the data layout is either NCHW or NHWC
+ *
+ * @note The input weights tensor is assumed 4D with the OFMs in the fourth dimension
+ * @note Data type should be passed at compile time using the -DDATA_TYPE, e.g. -DDATA_TYPE=float
+ * @note The third dimension of the input tensor should be passed at compile time when weights belong to a convolution layer using -DDIM2=size. e.g. -DDIM2=16.
+ * For depthwise convolution weight do not pass DIM2
+ * @note Data layout NHWC should be passed at compile time with -DNHWC. For data layout NCHW it is not required to pass any parameter
+ * @note Batch normalization epsilon parameter should be passed at compile time using -DEPSILON=value. e.g. -DEPSILON=0.001f
+ *
+ * @param[in] w_ptr Pointer to the weights tensor. Supported data types: F16/F32
+ * @param[in] w_stride_x Stride of the weights tensor in X dimension (in bytes)
+ * @param[in] w_step_x w_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] w_stride_y Stride of the weights tensor in Y dimension (in bytes)
+ * @param[in] w_step_y w_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] w_stride_z Stride of the weights tensor in Z dimension (in bytes)
+ * @param[in] w_step_z w_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] w_offset_first_element_in_bytes The offset of the first element in the weights tensor
+ * @param[in] b_ptr (Optional) Pointer to the bias tensor. Supported data types: same as @p w_ptr
+ * @param[in] b_stride_x (Optional) Stride of the bias tensor in X dimension (in bytes)
+ * @param[in] b_step_x (Optional) b_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] b_stride_y (Optional) Stride of the bias tensor in Y dimension (in bytes)
+ * @param[in] b_step_y (Optional) b_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] b_stride_z (Optional) Stride of the bias tensor in Z dimension (in bytes)
+ * @param[in] b_step_z (Optional) b_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] b_offset_first_element_in_bytes (Optional) The offset of the first element in the bias tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p w_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p w_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[out] w_fused_ptr (Optional) Pointer to the destination weights tensors. Supported data types: same as @p w_ptr
+ * @param[in] w_fused_stride_x (Optional) Stride of the destination weights tensor in X dimension (in bytes)
+ * @param[in] w_fused_step_x (Optional) w_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] w_fused_stride_y (Optional) Stride of the destination weights tensor in Y dimension (in bytes)
+ * @param[in] w_fused_step_y (Optional) w_fused_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] w_fused_stride_z (Optional) Stride of the destination weights tensor in Z dimension (in bytes)
+ * @param[in] w_fused_step_z (Optional) w_fused_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] w_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination weights tensor
+ * @param[in] b_fused_ptr (Optional) Pointer to the destination bias tensor. Supported data types: same as @p w_ptr
+ * @param[in] b_fused_stride_x (Optional) Stride of the destination bias tensor in X dimension (in bytes)
+ * @param[in] b_fused_step_x (Optional) b_fused_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] b_fused_offset_first_element_in_bytes (Optional) The offset of the first element in the destination bias tensor
+ * @param[in] beta_ptr (Optional) Pointer to the beta source tensor. Supported data types: same as @p w_ptr
+ * @param[in] beta_stride_x (Optional) Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x (Optional) beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes (Optional) The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr (Optional) Pointer to the gamma source tensor. Supported data types: same as @p w_ptr
+ * @param[in] gamma_stride_x (Optional) Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x (Optional) gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes (Optional) The offset of the first element in the gamma source tensor
+ */
+__kernel void fuse_batchnormalization_layer(TENSOR3D_DECLARATION(w),
+#if defined(BIAS)
+ VECTOR_DECLARATION(b),
+#endif // defined(BIAS)
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var)
+#ifndef IN_PLACE_W
+ ,
+ TENSOR3D_DECLARATION(w_fused)
+#endif // ifndef IN_PLACE_W
+#ifndef IN_PLACE_B
+ ,
+ VECTOR_DECLARATION(b_fused)
+#endif // ifndef IN_PLACE_B
+#if defined(BETA)
+ ,
+ VECTOR_DECLARATION(beta)
+#endif // defined(BETA)
+#if defined(GAMMA)
+ ,
+ VECTOR_DECLARATION(gamma)
+#endif // defined(GAMMA)
+ )
+{
+ int x = get_global_id(0);
+ int y = get_global_id(1);
+ int z = get_global_id(2);
+
+#if defined(DIM2)
+ int c0 = z % DIM2;
+ int c1 = z / DIM2;
+#else // ! defined(DIM2)
+ int c0 = 0;
+#if defined(NHWC)
+ int c1 = x;
+#else // defined(NHWC)
+ int c1 = z;
+#endif // defined(NHWC)
+#endif // defined(DIM2)
+
+ int w_offset = x * sizeof(DATA_TYPE) + y * w_stride_y + z * w_stride_z;
+ int v_offset = c1 * sizeof(DATA_TYPE);
+
+ DATA_TYPE w_old = 0.0f;
+ DATA_TYPE b_old = 0.0f;
+ DATA_TYPE w_new = 0.0f;
+ DATA_TYPE b_new = 0.0f;
+ DATA_TYPE gamma = 1.0f;
+ DATA_TYPE mean = 0.0f;
+ DATA_TYPE var = 1.0f;
+ DATA_TYPE beta = 0.0f;
+
+ w_old = *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes));
+ var = *((__global DATA_TYPE *)(var_ptr + v_offset + var_offset_first_element_in_bytes));
+ mean = *((__global DATA_TYPE *)(mean_ptr + v_offset + mean_offset_first_element_in_bytes));
+
+#if defined(GAMMA)
+ gamma = *((__global DATA_TYPE *)(gamma_ptr + v_offset + gamma_offset_first_element_in_bytes));
+#endif // defined(GAMMA)
+
+ // Compute new weight
+ w_new = (gamma * w_old) / (sqrt(var + EPSILON));
+
+#if defined(IN_PLACE_W)
+ *((__global DATA_TYPE *)(w_ptr + w_offset + w_offset_first_element_in_bytes)) = w_new;
+#else // defined(IN_PLACE_W)
+ *((__global DATA_TYPE *)(w_fused_ptr + w_offset + w_fused_offset_first_element_in_bytes)) = w_new;
+#endif // defined(IN_PLACE_W)
+
+ // Compute bias
+#if !defined(DIM2) && defined(NHWC)
+ if(z == 0 && y == 0)
+#else // !defined(DIM2) && defined(NHWC)
+ if(x == 0 && y == 0 && c0 == 0)
+#endif // !defined(DIM2) && defined(NHWC)
+ {
+#if defined(BIAS)
+ b_old = *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes));
+#endif // defined(BIAS)
+#if defined(BETA)
+ beta = *((__global DATA_TYPE *)(beta_ptr + v_offset + beta_offset_first_element_in_bytes));
+#endif // defined(BETA)
+
+ b_new = ((gamma * (b_old - mean)) / (sqrt(var + EPSILON))) + beta;
+
+#if defined(BIAS)
+
+#if defined(IN_PLACE_B)
+ *((__global DATA_TYPE *)(b_ptr + v_offset + b_offset_first_element_in_bytes)) = b_new;
+#else // defined(IN_PLACE_B)
+ *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // defined(IN_PLACE_B)
+
+#else // defined(BIAS)
+
+#ifndef IN_PLACE_B
+ *((__global DATA_TYPE *)(b_fused_ptr + v_offset + b_fused_offset_first_element_in_bytes)) = b_new;
+#endif // ifndef IN_PLACE_B
+
+#endif // defined(BIAS)
+ }
+}
+#endif // defined(DATA_TYPE) && defined(EPSILON) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/bitwise_op.cl b/src/core/CL/cl_kernels/common/bitwise_op.cl
index a600bced9e..e142c1d275 100644
--- a/src/core/CL/cl_kernels/bitwise_op.cl
+++ b/src/core/CL/cl_kernels/common/bitwise_op.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/bounding_box_transform.cl b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
index f2e9cb0ed0..f2e9cb0ed0 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform.cl
diff --git a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
index c1d45a56b9..c1d45a56b9 100644
--- a/src/core/CL/cl_kernels/bounding_box_transform_quantized.cl
+++ b/src/core/CL/cl_kernels/common/bounding_box_transform_quantized.cl
diff --git a/src/core/CL/cl_kernels/cast.cl b/src/core/CL/cl_kernels/common/cast.cl
index 036a683ec7..036a683ec7 100644
--- a/src/core/CL/cl_kernels/cast.cl
+++ b/src/core/CL/cl_kernels/common/cast.cl
diff --git a/src/core/CL/cl_kernels/col2im.cl b/src/core/CL/cl_kernels/common/col2im.cl
index 59c2d8a3aa..89054dcb31 100644
--- a/src/core/CL/cl_kernels/col2im.cl
+++ b/src/core/CL/cl_kernels/common/col2im.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/comparisons.cl b/src/core/CL/cl_kernels/common/comparisons.cl
index 408846144d..f05cb87835 100644
--- a/src/core/CL/cl_kernels/comparisons.cl
+++ b/src/core/CL/cl_kernels/common/comparisons.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/common/concatenate.cl
index d2e65408dc..394b20c739 100644
--- a/src/core/CL/cl_kernels/concatenate.cl
+++ b/src/core/CL/cl_kernels/common/concatenate.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/convert_fc_weights.cl b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
index a451c0213b..01ef04a7d6 100644
--- a/src/core/CL/cl_kernels/convert_fc_weights.cl
+++ b/src/core/CL/cl_kernels/common/convert_fc_weights.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/convolution_layer.cl b/src/core/CL/cl_kernels/common/convolution_layer.cl
index cfd1f12328..be76929ac8 100644
--- a/src/core/CL/cl_kernels/convolution_layer.cl
+++ b/src/core/CL/cl_kernels/common/convolution_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2019 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/copy_tensor.cl b/src/core/CL/cl_kernels/common/copy_tensor.cl
index 9c90969827..753b98d1b0 100644
--- a/src/core/CL/cl_kernels/copy_tensor.cl
+++ b/src/core/CL/cl_kernels/common/copy_tensor.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/crop_tensor.cl b/src/core/CL/cl_kernels/common/crop_tensor.cl
index d9090dc838..d9090dc838 100644
--- a/src/core/CL/cl_kernels/crop_tensor.cl
+++ b/src/core/CL/cl_kernels/common/crop_tensor.cl
diff --git a/src/core/CL/cl_kernels/deconvolution_layer.cl b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
index b1d5e61476..4ac5e3f0e9 100644
--- a/src/core/CL/cl_kernels/deconvolution_layer.cl
+++ b/src/core/CL/cl_kernels/common/deconvolution_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/common/dequantization_layer.cl b/src/core/CL/cl_kernels/common/dequantization_layer.cl
new file mode 100644
index 0000000000..7fa62577ce
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/dequantization_layer.cl
@@ -0,0 +1,90 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
+
+/** This performs the dequantization of 8-bit unsigned integers to floating point.
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
+ * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void dequantization_layer(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale and offset vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = SCALE;
+
+ const VEC_DATA_TYPE(int, VEC_SIZE)
+ voffset = OFFSET;
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/elementwise_operation.cl b/src/core/CL/cl_kernels/common/elementwise_operation.cl
index 45dcbfc6e2..45dcbfc6e2 100644
--- a/src/core/CL/cl_kernels/elementwise_operation.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation.cl
diff --git a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
index a11be80875..a11be80875 100644
--- a/src/core/CL/cl_kernels/elementwise_operation_quantized.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_operation_quantized.cl
diff --git a/src/core/CL/cl_kernels/elementwise_unary.cl b/src/core/CL/cl_kernels/common/elementwise_unary.cl
index d2d9d97d33..d2d9d97d33 100644
--- a/src/core/CL/cl_kernels/elementwise_unary.cl
+++ b/src/core/CL/cl_kernels/common/elementwise_unary.cl
diff --git a/src/core/CL/cl_kernels/fft.cl b/src/core/CL/cl_kernels/common/fft.cl
index 51763a620a..3f26d0f1a6 100644
--- a/src/core/CL/cl_kernels/fft.cl
+++ b/src/core/CL/cl_kernels/common/fft.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/fft_digit_reverse.cl b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
index de566212c6..5f64d95bf9 100644
--- a/src/core/CL/cl_kernels/fft_digit_reverse.cl
+++ b/src/core/CL/cl_kernels/common/fft_digit_reverse.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/fft_scale.cl b/src/core/CL/cl_kernels/common/fft_scale.cl
index 57e25ef504..c799dd3b9e 100644
--- a/src/core/CL/cl_kernels/fft_scale.cl
+++ b/src/core/CL/cl_kernels/common/fft_scale.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/fill_border.cl b/src/core/CL/cl_kernels/common/fill_border.cl
index 5775d899e8..a43343c9f4 100644
--- a/src/core/CL/cl_kernels/fill_border.cl
+++ b/src/core/CL/cl_kernels/common/fill_border.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/floor.cl b/src/core/CL/cl_kernels/common/floor.cl
index f6dd4edd2e..f6dd4edd2e 100644
--- a/src/core/CL/cl_kernels/floor.cl
+++ b/src/core/CL/cl_kernels/common/floor.cl
diff --git a/src/core/CL/cl_kernels/gather.cl b/src/core/CL/cl_kernels/common/gather.cl
index 41f439cb47..76eaefa92e 100644
--- a/src/core/CL/cl_kernels/gather.cl
+++ b/src/core/CL/cl_kernels/common/gather.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/gemm.cl b/src/core/CL/cl_kernels/common/gemm.cl
index 10435d376f..10435d376f 100644
--- a/src/core/CL/cl_kernels/gemm.cl
+++ b/src/core/CL/cl_kernels/common/gemm.cl
diff --git a/src/core/CL/cl_kernels/gemm_v1.cl b/src/core/CL/cl_kernels/common/gemm_v1.cl
index a136a1b96b..a136a1b96b 100644
--- a/src/core/CL/cl_kernels/gemm_v1.cl
+++ b/src/core/CL/cl_kernels/common/gemm_v1.cl
diff --git a/src/core/CL/cl_kernels/gemmlowp.cl b/src/core/CL/cl_kernels/common/gemmlowp.cl
index 5cafb5389c..5cafb5389c 100644
--- a/src/core/CL/cl_kernels/gemmlowp.cl
+++ b/src/core/CL/cl_kernels/common/gemmlowp.cl
diff --git a/src/core/CL/cl_kernels/gemv.cl b/src/core/CL/cl_kernels/common/gemv.cl
index aaa83975f8..71a372eb29 100644
--- a/src/core/CL/cl_kernels/gemv.cl
+++ b/src/core/CL/cl_kernels/common/gemv.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/generate_proposals.cl b/src/core/CL/cl_kernels/common/generate_proposals.cl
index e8306c55a8..5b8502072a 100644
--- a/src/core/CL/cl_kernels/generate_proposals.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/generate_proposals_quantized.cl b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
index 04264197f4..70f861c4b7 100644
--- a/src/core/CL/cl_kernels/generate_proposals_quantized.cl
+++ b/src/core/CL/cl_kernels/common/generate_proposals_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/instance_normalization.cl b/src/core/CL/cl_kernels/common/instance_normalization.cl
index adfbebd67d..adfbebd67d 100644
--- a/src/core/CL/cl_kernels/instance_normalization.cl
+++ b/src/core/CL/cl_kernels/common/instance_normalization.cl
diff --git a/src/core/CL/cl_kernels/l2_normalize.cl b/src/core/CL/cl_kernels/common/l2_normalize.cl
index fbe3406239..fbe3406239 100644
--- a/src/core/CL/cl_kernels/l2_normalize.cl
+++ b/src/core/CL/cl_kernels/common/l2_normalize.cl
diff --git a/src/core/CL/cl_kernels/mean_stddev_normalization.cl b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
index 76be629934..05727a6aa6 100644
--- a/src/core/CL/cl_kernels/mean_stddev_normalization.cl
+++ b/src/core/CL/cl_kernels/common/mean_stddev_normalization.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019, 2021 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/memset.cl b/src/core/CL/cl_kernels/common/memset.cl
index bb46a49f84..9ff25f3af4 100644
--- a/src/core/CL/cl_kernels/memset.cl
+++ b/src/core/CL/cl_kernels/common/memset.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/minmax_layer.cl b/src/core/CL/cl_kernels/common/minmax_layer.cl
index 655696f9a1..49356451df 100644
--- a/src/core/CL/cl_kernels/minmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/minmax_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/nonmax.cl b/src/core/CL/cl_kernels/common/nonmax.cl
index ab13131807..702e635a89 100644
--- a/src/core/CL/cl_kernels/nonmax.cl
+++ b/src/core/CL/cl_kernels/common/nonmax.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/pad_layer.cl b/src/core/CL/cl_kernels/common/pad_layer.cl
index 903e924a2f..5ae4ec884d 100644
--- a/src/core/CL/cl_kernels/pad_layer.cl
+++ b/src/core/CL/cl_kernels/common/pad_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/permute.cl b/src/core/CL/cl_kernels/common/permute.cl
index db9e7ecc25..a03eeb1a19 100644
--- a/src/core/CL/cl_kernels/permute.cl
+++ b/src/core/CL/cl_kernels/common/permute.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_float.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
index 10875293a9..10875293a9 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_float.cl
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_float.cl
diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
index 6d1c2d0c79..6d1c2d0c79 100644
--- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl
+++ b/src/core/CL/cl_kernels/common/pixelwise_mul_int.cl
diff --git a/src/core/CL/cl_kernels/common/pooling_layer.cl b/src/core/CL/cl_kernels/common/pooling_layer.cl
new file mode 100644
index 0000000000..5122f2c251
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/pooling_layer.cl
@@ -0,0 +1,390 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if STRIDE_X == 1
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
+#elif STRIDE_X == 2 /* STRIDE_X == 1 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
+#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
+#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
+#endif /* STRIDE_X == 3 */
+
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+ CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+#define POOLING3x3_STRIDE1(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
+ data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
+ data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
+ data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
+ data00 = POW2_OP(data00, 4); \
+ data01 = POW2_OP(data01, 2); \
+ data10 = POW2_OP(data10, 4); \
+ data11 = POW2_OP(data11, 2); \
+ data20 = POW2_OP(data20, 4); \
+ data21 = POW2_OP(data21, 2); \
+ \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
+ \
+ values00 = POOL_OP(values00, values10); \
+ values01 = POOL_OP(values01, values11); \
+ values00 = POOL_OP(values00, values20); \
+ values01 = POOL_OP(values01, values21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
+ })
+
+#define POOLING3x3_STRIDE2(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 1); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 1); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 1); \
+ \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \
+ \
+ values00 = POOL_OP(values00, values10); \
+ values01 = POOL_OP(values01, values11); \
+ values00 = POOL_OP(values00, values20); \
+ values01 = POOL_OP(values01, values21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
+ })
+
+#define POOLING3x3_STRIDE3(res, input, output) \
+ ({ \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
+ data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
+ data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
+ data00 = POW2_OP(data00, 8); \
+ data01 = POW2_OP(data01, 4); \
+ data10 = POW2_OP(data10, 8); \
+ data11 = POW2_OP(data11, 4); \
+ data20 = POW2_OP(data20, 8); \
+ data21 = POW2_OP(data21, 4); \
+ \
+ data00 = POOL_OP(data00, data10); \
+ data01 = POOL_OP(data01, data11); \
+ data00 = POOL_OP(data00, data20); \
+ data01 = POOL_OP(data01, data21); \
+ \
+ res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
+ res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \
+ })
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to 2.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_2(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+ data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+ data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 2);
+ data1 = POW2_OP(data1, 2);
+#endif /* defined(POOL_L2) */
+
+ // Perform calculations
+ data0 = POOL_OP(data0, data1);
+ ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average or l2 pooling
+ res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+/** Performs a pooling function of pool size equal to 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_3(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ // Load data
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+ data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+ data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
+ data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 = POW2_OP(data0, 3);
+ data1 = POW2_OP(data1, 3);
+ data2 = POW2_OP(data2, 3);
+#endif /* defined(POOL_L2) */
+
+ // Perform calculations
+ data0 = POOL_OP(data0, data1);
+ data0 = POOL_OP(data0, data2);
+ ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average pooling
+ res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+
+#if defined(POOLING3x3)
+
+#define CONVERT_OP(data_type) convert_##data_type##4
+#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
+
+VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
+ const int end_y = min(start_y + pool_size, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max((int4)0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
+}
+
+/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_optimized_3(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ res;
+
+ // Perform pooling 3x3 for 4 output elements
+ POOLING3x3(res, input, output);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average pooling
+ res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
+}
+#endif // defined(POOLING3x3)
diff --git a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
index 24cb111772..4494dd8cec 100644
--- a/src/core/CL/cl_kernels/qlstm_layer_normalization.cl
+++ b/src/core/CL/cl_kernels/common/qlstm_layer_normalization.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/quantization_layer.cl b/src/core/CL/cl_kernels/common/quantization_layer.cl
index 3538dae5f0..69cc288c25 100644
--- a/src/core/CL/cl_kernels/quantization_layer.cl
+++ b/src/core/CL/cl_kernels/common/quantization_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -80,8 +80,8 @@ __kernel void quantization_layer(
// Create scale and offset vectors
const VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE) vscale = SCALE;
- const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
-#else // defined(IS_FLOAT)
+ const VEC_DATA_TYPE(int, VEC_SIZE) voffset = OFFSET;
+#else // defined(IS_FLOAT)
// Load data
VEC_DATA_TYPE(DATA_TYPE_IN, VEC_SIZE)
val = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_IN *)input.ptr);
diff --git a/src/core/CL/cl_kernels/range.cl b/src/core/CL/cl_kernels/common/range.cl
index d25d10e207..d25d10e207 100644
--- a/src/core/CL/cl_kernels/range.cl
+++ b/src/core/CL/cl_kernels/common/range.cl
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/common/reduction_operation.cl
index 9f2c6e23b5..9f2c6e23b5 100644
--- a/src/core/CL/cl_kernels/reduction_operation.cl
+++ b/src/core/CL/cl_kernels/common/reduction_operation.cl
diff --git a/src/core/CL/cl_kernels/reshape_layer.cl b/src/core/CL/cl_kernels/common/reshape_layer.cl
index 2d6a7edade..bfdefc863e 100644
--- a/src/core/CL/cl_kernels/reshape_layer.cl
+++ b/src/core/CL/cl_kernels/common/reshape_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/reverse.cl b/src/core/CL/cl_kernels/common/reverse.cl
index 10ffe84aeb..6b0afb9c2c 100644
--- a/src/core/CL/cl_kernels/reverse.cl
+++ b/src/core/CL/cl_kernels/common/reverse.cl
@@ -1,5 +1,5 @@
/*
-* Copyright (c) 2018-2020 Arm Limited.
+* Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/roi_align_layer.cl b/src/core/CL/cl_kernels/common/roi_align_layer.cl
index e0b98e68c9..8cfe5ddcb6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_align_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -173,7 +173,7 @@ __kernel void roi_align_layer(
const float2 roi_bin_grid = SAMPLING_RATIO;
#else // !defined(SAMPLING_RATIO)
// Note that we subtract EPS_GRID before ceiling. This is to avoid situations where 1.000001 gets ceiled to 2.
- const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
+ const float2 roi_bin_grid = ceil(bin_size - EPS_GRID);
#endif // defined(SAMPLING_RATIO)
// Move input and output pointer across the fourth dimension
@@ -184,7 +184,7 @@ __kernel void roi_align_layer(
#if defined(NHWC)
__global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, pz, px, py);
#else // !defined(NHWC)
- __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
+ __global DATA_TYPE *_output_ptr = (__global DATA_TYPE *)tensor3D_offset(&output, px, py, pz);
#endif // defined(NHWC)
*_output_ptr = (__global DATA_TYPE)roi_align_1x1(&input,
region_start.x,
diff --git a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
index d5c9a0d9bf..e75dee06f6 100644
--- a/src/core/CL/cl_kernels/roi_align_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/common/roi_align_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2019-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/roi_pooling_layer.cl b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
index 6899b952e0..6899b952e0 100644
--- a/src/core/CL/cl_kernels/roi_pooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/roi_pooling_layer.cl
diff --git a/src/core/CL/cl_kernels/select.cl b/src/core/CL/cl_kernels/common/select.cl
index 6fd4bd4ce3..6fd4bd4ce3 100644
--- a/src/core/CL/cl_kernels/select.cl
+++ b/src/core/CL/cl_kernels/common/select.cl
diff --git a/src/core/CL/cl_kernels/slice_ops.cl b/src/core/CL/cl_kernels/common/slice_ops.cl
index dc3ffd91c1..d12c60f5ea 100644
--- a/src/core/CL/cl_kernels/slice_ops.cl
+++ b/src/core/CL/cl_kernels/common/slice_ops.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/softmax_layer.cl b/src/core/CL/cl_kernels/common/softmax_layer.cl
index 4d2d89dd73..4d2d89dd73 100644
--- a/src/core/CL/cl_kernels/softmax_layer.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer.cl
diff --git a/src/core/CL/cl_kernels/softmax_layer_quantized.cl b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
index 4d5006d804..4d5006d804 100644
--- a/src/core/CL/cl_kernels/softmax_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/common/softmax_layer_quantized.cl
diff --git a/src/core/CL/cl_kernels/stack_layer.cl b/src/core/CL/cl_kernels/common/stack_layer.cl
index 438e858df2..2468bf750d 100644
--- a/src/core/CL/cl_kernels/stack_layer.cl
+++ b/src/core/CL/cl_kernels/common/stack_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/tile.cl b/src/core/CL/cl_kernels/common/tile.cl
index 79da7fe6b9..4332411688 100644
--- a/src/core/CL/cl_kernels/tile.cl
+++ b/src/core/CL/cl_kernels/common/tile.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/transpose.cl b/src/core/CL/cl_kernels/common/transpose.cl
index 82db2908b5..82db2908b5 100644
--- a/src/core/CL/cl_kernels/transpose.cl
+++ b/src/core/CL/cl_kernels/common/transpose.cl
diff --git a/src/core/CL/cl_kernels/unpooling_layer.cl b/src/core/CL/cl_kernels/common/unpooling_layer.cl
index 457e9bf8f1..6662dc9360 100644
--- a/src/core/CL/cl_kernels/unpooling_layer.cl
+++ b/src/core/CL/cl_kernels/common/unpooling_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2020 Arm Limited.
+ * Copyright (c) 2020-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl
deleted file mode 100644
index 127f67d940..0000000000
--- a/src/core/CL/cl_kernels/dequantization_layer.cl
+++ /dev/null
@@ -1,212 +0,0 @@
-/*
- * Copyright (c) 2017-2020 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-/** This performs the dequantization of 8-bit unsigned integers to floating point.
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Quantization scale of input tensor is passed in with -DSCALE=scale.
- * @note Quantization offset of input tensor is passed in with -DOFFSET=offset.
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED/QSYMM8
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void dequantization_layer(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale and offset vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = SCALE;
-
- const VEC_DATA_TYPE(int, VEC_SIZE)
- voffset = OFFSET;
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE));
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET)
-
-#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- __global float *scale)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = scale[get_global_id(2)];
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
- *
- * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
- * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] scale Pointer to buffer with the per channel quantized scales
- */
-__kernel void dequantization_layer_per_channel_nhwc(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- __global float *scale)
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
-#if defined(LAST_ACCESSED_X)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi = (int)(get_global_id(0) * VEC_SIZE);
- input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
- output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
- scale -= max(xi - (int)LAST_ACCESSED_X, 0);
-
- // Load data
- VEC_DATA_TYPE(int, VEC_SIZE)
- val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
-
- // Create scale vectors
- const VEC_DATA_TYPE(float, VEC_SIZE)
- vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
-
- // Dequantize
- VEC_DATA_TYPE(float, VEC_SIZE)
- res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
-
- // Store result
- VSTORE(VEC_SIZE)
- (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
-#else // !defined(LAST_ACCESSED_X)
- *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
-#endif // defined(LAST_ACCESSED_X)
-}
-#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
diff --git a/src/core/CL/cl_kernels/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
index f301e64d66..89129cff3f 100644
--- a/src/core/CL/cl_kernels/depth_to_space.cl
+++ b/src/core/CL/cl_kernels/nchw/batch_to_space.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2021 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,14 +23,14 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Depth to space transformation. (NCHW)
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NCHW)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -39,6 +39,12 @@
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -48,31 +54,41 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void depth_to_space_nchw(
+__kernel void batch_to_space_nchw(
TENSOR3D_DECLARATION(input),
const int batch_id,
+ VECTOR_DECLARATION(block_shape),
TENSOR4D_DECLARATION(output))
{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int r = (BATCH_SIZE / (block_x * block_y));
const int x = get_global_id(0);
const int y = get_global_id(1);
- const int z = get_global_id(2) % r;
+ const int z = get_global_id(2);
+ const int w = batch_id % r;
- const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
- const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
- *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
}
-/** Depth to space transformation. (NHWC)
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NCHW)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
*
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
* @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
* @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
@@ -90,7 +106,7 @@ __kernel void depth_to_space_nchw(
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void depth_to_space_nhwc(
+__kernel void batch_to_space_static_nchw(
TENSOR3D_DECLARATION(input),
const int batch_id,
TENSOR4D_DECLARATION(output))
@@ -98,14 +114,18 @@ __kernel void depth_to_space_nhwc(
Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(1);
- const int y = get_global_id(2);
- const int z = get_global_id(0) % r;
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
+
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2);
+ const int w = batch_id % r;
- const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
- const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
- *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, w)) = *((__global DATA_TYPE *)in.ptr);
}
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
new file mode 100644
index 0000000000..2d466661b3
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/batchnormalization_layer.cl
@@ -0,0 +1,147 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nchw(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+#ifdef IN_PLACE
+ Tensor3D out = in;
+#else /* IN_PLACE */
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+#endif /* IN_PLACE */
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector var = CONVERT_TO_VECTOR_STRUCT(var);
+#ifndef USE_DEFAULT_BETA
+ Vector beta = CONVERT_TO_VECTOR_STRUCT(beta);
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ Vector gamma = CONVERT_TO_VECTOR_STRUCT(gamma);
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res = 0;
+
+ const int current_slice = get_global_id(2);
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ denominator = *((__global DATA_TYPE *)(var.ptr + current_slice * var.stride_x));
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = *((__global DATA_TYPE *)(mean.ptr + current_slice * mean.stride_x));
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = *((__global DATA_TYPE *)(gamma.ptr + current_slice * gamma.stride_x));
+
+ res = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = *((__global DATA_TYPE *)(beta.ptr + current_slice * beta.stride_x));
+ // beta is not zero, hence we need to perform the addition
+ res = ADD_OP(res, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res, A_VAL, B_VAL);
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/channel_shuffle.cl b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
new file mode 100644
index 0000000000..57d82e1e6f
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/channel_shuffle.cl
@@ -0,0 +1,103 @@
+/*
+* Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+
+// Check valid VEC_SIZES
+#if VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+#error "Only vector sizes 1, 2, 3, 4, 8 and 16 are supported"
+#endif // VEC_SIZE != 1 && VEC_SIZE != 2 && VEC_SIZE != 3 && VEC_SIZE != 4 && VEC_SIZE != 8 && VEC_SIZE != 16
+
+#define DIV_MOD_UINT(x, y, div_res, mod_res) \
+ ({ \
+ div_res = (uint)((x) * (float)(1.0f / (float)(y))); \
+ uint r = div_res * (y); \
+ mod_res = (x)-r; \
+ })
+
+/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
+ *
+ * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
+ * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
+ * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
+ * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
+ * K is equal to num_channels / num_groups.
+ *
+ * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst))
+{
+ uint curr_channel = 0; // channel id of input
+ uint batch_id = 0; // batch id
+ uint group_id = 0; // group id
+ uint channel_id = 0; // channel id within the group
+
+ // Compute curr_channel and batch_id
+ DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
+
+ // Compute group_id and channel_id
+ DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
+
+ const uint x = get_global_id(0) * VEC_SIZE;
+ const uint y = get_global_id(1) * 2;
+ const uint z = channel_id * NUM_GROUPS + group_id;
+
+ // Load the Nx2 block
+ const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
+
+ // Store blocks
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
+ VSTORE(VEC_SIZE)
+ (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
+ VSTORE(VEC_SIZE)
+ (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
+}
+
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/depth_to_space.cl b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
new file mode 100644
index 0000000000..b9f223fe9d
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nchw(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, out_x, out_y, z, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/dequantization_layer.cl b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
new file mode 100644
index 0000000000..e0203f7408
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/dequantization_layer.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NCHW)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nchw(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = scale[get_global_id(2)];
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(2)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution1x1.cl b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
index 8ab2d1d4ea..8ab2d1d4ea 100644
--- a/src/core/CL/cl_kernels/direct_convolution1x1.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution1x1.cl
diff --git a/src/core/CL/cl_kernels/direct_convolution3x3.cl b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
index 811df053c4..811df053c4 100644
--- a/src/core/CL/cl_kernels/direct_convolution3x3.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution3x3.cl
diff --git a/src/core/CL/cl_kernels/direct_convolution5x5.cl b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
index 59d668f0bf..59d668f0bf 100644
--- a/src/core/CL/cl_kernels/direct_convolution5x5.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution5x5.cl
diff --git a/src/core/CL/cl_kernels/direct_convolution_quantized.cl b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
index b80d4f587e..b80d4f587e 100644
--- a/src/core/CL/cl_kernels/direct_convolution_quantized.cl
+++ b/src/core/CL/cl_kernels/nchw/direct_convolution_quantized.cl
diff --git a/src/core/CL/cl_kernels/im2col.cl b/src/core/CL/cl_kernels/nchw/im2col.cl
index a1467a0b36..fddf918c63 100644
--- a/src/core/CL/cl_kernels/im2col.cl
+++ b/src/core/CL/cl_kernels/nchw/im2col.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -22,7 +22,6 @@
* SOFTWARE.
*/
#include "helpers.h"
-
#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
#if ELEMENT_SIZE == 1
@@ -861,500 +860,4 @@ __kernel void im2col_generic_padx0_pady0_nchw(
#endif // HAS_BIAS
}
#endif //defined(CONVOLVED_WIDTH) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(PAD_RIGHT) && defined(PAD_BOTTOM) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(VECTOR_SIZE) && defined(WIDTH_MOD_VECTOR_SIZE)
-
-#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-
-#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
-#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
-
-/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
- * @name IM2COL1X9_NHWC_STORE
- *
- * @note To use this macro for a 3x3 block, @p ROW has to be 0
- *
- * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
- * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size
- * @param[in] DATA_TYPE Data type of @p DATA
- * @param[in] SRC_DEPTH Input channel size / depth
- * @param[in] DATA Value variable base name
- * @param[in] ROW The row number to store. Supported: 0-8
- * @param[in] OUTPUT_PTR Output pointer
- * @{
- */
-#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- const bool at_channel_boundary = get_global_id(0) == 0; \
- if(at_channel_boundary) \
- { \
- IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- } \
- else \
- { \
- IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- }
-#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
-#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
-
-#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- VSTORE(VECTOR_SIZE) \
- (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
- VSTORE(VECTOR_SIZE) \
- (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-
-#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
- (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
-/** @}*/
-
-/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col3x3_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch size
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int yi_coord = 0;
- int3 offset = 0;
-
- // Clamp xi
- int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
- xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
- // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
- xi_offset *= (int3)src_stride_y;
-
- // Out-of-bound condition for X
- int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
-
- // yi == 0
- // Clamp yi
- // yi_coord is casted to unsigned int in order to use just a min() operation
- // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
- // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
- // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
- // also causes y_cond (y padding condition) to be satisfied
- yi_coord = yi - (int)PAD_TOP;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // yi == 1
- // Clamp yi_coord (it can be negative if PAD_TOP > 1)
- yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with zeros
- y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
- values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // yi == 2
- // Clamp yi_coord
- yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
-
- // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
-#if PAD_TOP != 0 || PAD_BOTTOM != 0
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
-#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
-
- // Compute offset
- offset = xi_offset + (yi_coord * (int)src_stride_z);
-
- // Load input values
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
- // Replace invalid values with PAD_VALUE
- y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
- values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
- values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
- values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
- // Store in a boundary-aware way to avoid padding
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
-#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i) \
- ({ \
- yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
- \
- offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
- offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
- \
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
- \
- int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
- values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
- values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
- values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
- values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
- values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
- values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
- values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
- values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
- \
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
- })
-#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-#define IM2COL1x9(i) \
- ({ \
- yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
- yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
- \
- offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
- offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
- \
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
- VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
- VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
- VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
- VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
- VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
- VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
- VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
- VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
- \
- IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
- })
-#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
-
-/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
- * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col9x9_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch size
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int yi_coord = 0;
- int8 offset0 = 0;
- int offset1 = 0;
-
- // Clamp xi
- int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
- int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
-
-#if PAD_LEFT != 0 || PAD_RIGHT != 0
-#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
- xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
- xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
-#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
- xi_offset0 *= (int8)src_stride_y;
- xi_offset1 *= (int)src_stride_y;
-
- // Out-of-bound condition for X
- int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
- int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
- IM2COL1x9(0);
- IM2COL1x9(1);
- IM2COL1x9(2);
- IM2COL1x9(3);
- IM2COL1x9(4);
- IM2COL1x9(5);
- IM2COL1x9(6);
- IM2COL1x9(7);
- IM2COL1x9(8);
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-
-/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
- *
- * @note This kernel computes VECTOR_SIZE elements
- * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
- * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
- * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
- * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
- * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
- * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
- * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
- * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
- * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
- * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
- */
-__kernel void im2col_generic_nhwc(
- TENSOR3D_DECLARATION(src),
- IMAGE_DECLARATION(dst),
- uint src_stride_w,
- uint dst_stride_w)
-{
- // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
- const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
- const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
- const int yo = get_global_id(1);
- const int batch = get_global_id(2); // batch size
-
- // Calculate input indices
- const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
- const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
-
- // Get input and output address
- __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
-
- int i = 0;
- for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
- {
- // Clamp yi_coord
- int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
- yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
-
- // Out-of-bound condition for Y
- int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
-
- for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
- {
- // Clamp xi_coord
- int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
- xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
-
- // Out-of-bound condition for X
- int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
-
- int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
-
- VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
-
-#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
- // Replace with PAD_VALUE if the value is out-of-bound
- values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
-#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
-
- // Store in a boundary-aware way to avoid padding
-#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
- const bool at_channel_boundary = get_global_id(0) == 0;
- if(at_channel_boundary)
- {
- VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
- (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
- }
- else // at_channel_boundary
-#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
- {
- VSTORE(VECTOR_SIZE)
- (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
- }
- i++;
- }
- }
-
-#ifdef HAS_BIAS
- // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
- // added at the end of the channel, while the boundary vec is at the beginning of the channel.
- // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
- // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
- // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
- if((ch + VECTOR_SIZE) >= SRC_DEPTH)
- {
- *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
- }
-#endif // HAS_BIAS
-}
-#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
-#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalization_layer.cl b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
new file mode 100644
index 0000000000..0fef98e295
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalization_layer.cl
@@ -0,0 +1,175 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define MUL_OP(x, y) ((x) * (y))
+#define ADD_OP(x, y) ((x) + (y))
+#define DIV_OP(x, y) ((x) / (y))
+#define POW_OP(x, y) pow((x), (y))
+#define SQCVT_SAT(a) (a)
+
+#if defined(NUM_SLICES)
+/** Apply cross-map normalization.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
+
+ const int current_slice = get_global_id(2);
+ const int left_slice = max(-(int)RADIUS, -current_slice);
+ const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
+
+ for(int i = left_slice; i <= right_slice; i++)
+ {
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+ VSTORE(VEC_SIZE)
+ (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif /* defined(NUM_SLICES) */
+
+#if defined(WIDTH_SIZE)
+/** Apply in-map normalization when tensors are in the NCHW data layout format.
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
+ * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
+ * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
+ * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ acc = 0;
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ coeff_v = SQCVT_SAT(COEFF);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_v = SQCVT_SAT(BETA);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ kappa_v = SQCVT_SAT(KAPPA);
+
+ const int current_col = get_global_id(0) << 2;
+ const int left_pos = max(-(int)RADIUS, -3 - current_col);
+ const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
+
+#if defined(IN_MAP_2D)
+ const int current_row = get_global_id(1);
+ const int first_row = max(-(int)RADIUS, -current_row);
+ const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
+#endif /* defined(IN_MAP_2D) */
+
+#if defined(IN_MAP_2D)
+ for(int j = first_row; j <= last_row; ++j)
+ {
+#endif /* defined(IN_MAP_2D) */
+ for(int i = left_pos; i <= right_pos; ++i)
+ {
+#if defined(IN_MAP_2D)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
+#else /* defined(IN_MAP_2D) */
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
+#endif /* defined(IN_MAP_2D) */
+ acc = ADD_OP(acc, MUL_OP(values, values));
+ }
+#if defined(IN_MAP_2D)
+ }
+#endif /* defined(IN_MAP_2D) */
+
+ acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized = POW_OP(acc, beta_v);
+ const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
+
+ VSTORE(VEC_SIZE)
+ (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(WIDTH_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
new file mode 100644
index 0000000000..23a0de76f7
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer.cl
@@ -0,0 +1,82 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+ const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
+ const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
+
+ TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
+ TYPE res = (data - curr_mean) / curr_std;
+
+ VSTORE(VEC_SIZE)
+ (res, 0, (__global DATA_TYPE *)dst.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
new file mode 100644
index 0000000000..0f02ef6184
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/normalize_planar_yuv_layer_quantized.cl
@@ -0,0 +1,101 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+
+#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#define OFFSET_FLT ((float)OFFSET)
+#define SCALE_FLT ((float)SCALE)
+
+#if defined(NUM_CHANNELS)
+
+/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
+ *
+ * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
+ * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
+ * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
+ * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
+ *
+ * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
+ * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
+ * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ */
+__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst),
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(std))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+ Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
+ Vector std = CONVERT_TO_VECTOR_STRUCT(std);
+
+ const uint current_slice = get_global_id(2) % NUM_CHANNELS;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
+ curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
+
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
+ data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
+
+ // Perform normalization
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
+
+ const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
+ VSTORE(VEC_SIZE)
+ (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
+}
+
+#endif // defined(NUM_CHANNELS)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer.cl b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
new file mode 100644
index 0000000000..790ddb381a
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer.cl
@@ -0,0 +1,331 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(FP_MIXED_PRECISION)
+#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
+ CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
+#else /* defined(FP_MIXED_PRECISION) */
+#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
+#endif /* defined(FP_MIXED_PRECISION) */
+
+ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+/** Performs a pooling function of pool size equal to N (NCHW)
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nchw(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ vdata = INITIAL_VALUE;
+ ACC_DATA_TYPE sdata = INITIAL_VALUE;
+
+ // Load data
+ for(int y = 0; y < POOL_SIZE_Y; y++)
+ {
+ int x = 0;
+ for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+ {
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
+ data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < (int)POOL_SIZE_X; ++x)
+ {
+ ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif /* defined(POOL_L2) */
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
+
+ // Reduce result
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
+ reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
+ reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ // Divide by pool region in case of average pooling
+ res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res = SQRT_OP(res);
+#endif /* defined(POOL_L2) */
+
+ // Store result
+ *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
+{
+ const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
+ const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
+
+ const int x = get_global_id(0) * STRIDE_X;
+ const int y = get_global_id(1) * STRIDE_Y;
+ const int z = get_global_id(2);
+
+ //x axis: width, y axis: height, z axis: component
+ const uint padded_offset = input->offset_first_element_in_bytes
+ + x * input->stride_x
+ + y * input->stride_y
+ + z * input->stride_z;
+
+ const uint offset_base = padded_offset
+ - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
+ - PAD_TENSOR_TOP * input->stride_y /* top padding */
+ - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
+ - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
+
+#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
+ *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
+#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+ *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
+#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
+
+ *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
+
+ return;
+}
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp32(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ TENSOR3D_DECLARATION(indices))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+ // Load data
+ float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
+ float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Perform calculations
+ float data0_max = POOL_OP(data0.s0, data0.s1);
+ float data1_max = POOL_OP(data1.s0, data1.s1);
+ float res = POOL_OP(data0_max, data1_max);
+ // Store result
+ *(__global float *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+ uint offset_top = 0;
+ uint offset_bottom = 0;
+
+ offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+ uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+ uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+ uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+ *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+}
+
+/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
+ *
+ * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2_nchw_indices_fp16(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ TENSOR3D_DECLARATION(indices))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
+
+ // Load data
+ half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
+ half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
+
+ // Perform calculations
+ half data0_max = POOL_OP(data0.s0, data0.s1);
+ half data1_max = POOL_OP(data1.s0, data1.s1);
+ half res = POOL_OP(data0_max, data1_max);
+ // Store result
+ *(__global half *)output.ptr = res;
+
+#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+
+ uint offset_top = 0;
+ uint offset_bottom = 0;
+
+ offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
+
+ uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
+ uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
+ uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
+
+ *(__global uint *)indices.ptr = index;
+
+#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
new file mode 100644
index 0000000000..1440ef3ed1
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/pooling_layer_quantized.cl
@@ -0,0 +1,142 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(INITIAL_VALUE)
+#define VEC_TYPE(VEC_SIZE) VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+
+#if defined(POOL_AVG)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) */
+#define POOL_OP(x, y) (max((x), (y)))
+#endif /* defined(POOL_AVG) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+
+#if defined(POOL_L2)
+#error "L2 pooling is not supported"
+#endif /* defined(POOL_L2) */
+
+int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
+ const int pad_x, const int pad_y, const int stride_x, const int stride_y)
+{
+ int start_x = get_global_id(0) * stride_x - pad_x;
+ int start_y = get_global_id(1) * stride_y - pad_y;
+ const int end_x = min(start_x + pool_size_x, upper_bound_w);
+ const int end_y = min(start_y + pool_size_y, upper_bound_h);
+#if defined(EXCLUDE_PADDING)
+ start_x = max(0, start_x);
+ start_y = max(0, start_y);
+#endif /* defined(EXCLUDE_PADDING) */
+ return ((end_y - start_y) * (end_x - start_x));
+}
+
+/** Performs a pooling function of pool size equal to N (NCHW)
+ *
+ * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
+ * @note In case of average pooling the following information must be passed at compile time:
+ * -DPOOL_AVG must be provided otherwise max pooling will be performed.
+ * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
+ * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
+ * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
+ * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void pooling_layer_MxN_quantized_nchw(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output))
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int8 vdata = INITIAL_VALUE;
+ int sdata = INITIAL_VALUE;
+
+ // Load data
+ for(int y = 0; y < POOL_SIZE_Y; y++)
+ {
+ int x = 0;
+ for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
+ {
+ VEC_TYPE(8)
+ data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+ int8 data0 = convert_int8(data);
+ vdata = POOL_OP(vdata, data0);
+ }
+
+ // Leftover
+ for(; x < (int)POOL_SIZE_X; ++x)
+ {
+ DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
+ int data0 = convert_int(data);
+ sdata = POOL_OP(sdata, data0);
+ }
+ }
+
+ // Reduce result
+ int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
+ int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
+ int res = POOL_OP(reduce2.s0, reduce2.s1);
+ res = POOL_OP(res, sdata);
+
+#if defined(POOL_AVG)
+ res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
+#endif /* defined(POOL_AVG) */
+
+ DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
+
+#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
+
+ const float result_f32 = convert_float(result_q8);
+ const float input_offset = (float)OFFSET_IN1;
+ const float input_scale = (float)SCALE_IN1;
+ const float scale_out = (float)SCALE_OUT;
+ const float offset_out = (float)OFFSET_OUT;
+ const float in_f32 = (result_f32 - input_offset) * input_scale;
+ const float out_f32 = in_f32 / scale_out + offset_out;
+ result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
+
+#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
+
+ *(__global DATA_TYPE *)output.ptr = result_q8;
+}
+#endif // defined(DATA_TYPE) && defined(INITIAL_VALUE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/prior_box_layer.cl b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
index de10decdec..7524ba7b4a 100644
--- a/src/core/CL/cl_kernels/prior_box_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/prior_box_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
diff --git a/src/core/CL/cl_kernels/nchw/remap.cl b/src/core/CL/cl_kernels/nchw/remap.cl
new file mode 100644
index 0000000000..fab88a1682
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/remap.cl
@@ -0,0 +1,133 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+#ifndef DEPTH_OUT
+/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_nearest_neighbour_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+ vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
+}
+
+/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
+ *
+ * This kernel performs remapping with this method of pixel coordinate translation:
+ * out(x,y) = in(mapx(x,y), mapy(x,y));
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32.
+ * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
+ * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
+ * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
+ * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
+ * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
+ * @param[in] width Width of the input image
+ * @param[in] height Height of the input image
+ */
+__kernel void remap_bilinear_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ IMAGE_DECLARATION(mapx),
+ IMAGE_DECLARATION(mapy),
+ const float width,
+ const float height)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
+ Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
+
+ float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
+ float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
+ float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
+ mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
+
+ vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
+}
+#endif // DEPTH_OUT \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/reorg_layer.cl b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
index 29344de37a..f66b17c1a6 100644
--- a/src/core/CL/cl_kernels/reorg_layer.cl
+++ b/src/core/CL/cl_kernels/nchw/reorg_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -72,45 +72,4 @@ __kernel void reorg_layer_nchw(
int src_offset = xi * sizeof(DATA_TYPE) + yi * src_stride_y + zi * src_stride_z;
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
}
-
-/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
- *
- * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
- * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
- * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void reorg_layer_nhwc(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
- int xo = get_global_id(1);
- int yo = get_global_id(2);
- int zo = get_global_id(0);
- int xi, yi, zi;
-
- CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
-
- int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
-}
#endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale.cl b/src/core/CL/cl_kernels/nchw/scale.cl
new file mode 100644
index 0000000000..63a53cc4f2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale.cl
@@ -0,0 +1,148 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "warp_helpers.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_nearest(const float2 coord, const float2 scale)
+{
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_nearest_neighbour_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float scale_x,
+ const float scale_y)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(scale_x, scale_y);
+ float8 transformed = transform_nearest(get_current_coords(), r);
+#ifdef ALIGN_CORNERS
+ transformed = round(transformed);
+#endif // ALIGN_CORNERS
+ const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
+ vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float scale_x,
+ const float scale_y)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(scale_x, scale_y);
+ const float8 tc = transform_bilinear(get_current_coords(), r);
+ vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/scale_quantized.cl b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
new file mode 100644
index 0000000000..946ad65c14
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/scale_quantized.cl
@@ -0,0 +1,86 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers_asymm.h"
+#include "warp_helpers_quantized.h"
+
+/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
+ *
+ * @param[in] coord 2D coordinates to transform.
+ * @param[in] scale input/output scale ratio
+ *
+ * @return a float8 containing 4 2D transformed values in the input image.
+ */
+inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
+{
+ const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
+#ifdef SAMPLING_POLICY_TOP_LEFT
+ const float4 new_x = in_x_coords * (float4)(scale.s0);
+ const float4 new_y = (float4)(coord.s1 * scale.s1);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#elif SAMPLING_POLICY_CENTER
+ const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
+ const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
+ return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
+#else /* SAMPLING_POLICY */
+#error("Unsupported sampling policy");
+#endif /* SAMPLING_POLICY */
+}
+
+/** Performs an affine transformation on an image interpolating with the BILINEAR method.
+ *
+ * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
+ * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
+ * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
+ * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
+ * @param[in] input_width Input image width
+ * @param[in] input_height Input image height
+ * @param[in] scale_x The scale factor along x dimension
+ * @param[in] scale_y The scale factor along y dimension
+ */
+__kernel void scale_bilinear_quantized_nchw(
+ IMAGE_DECLARATION(in),
+ IMAGE_DECLARATION(out),
+ const float input_width,
+ const float input_height,
+ const float scale_x,
+ const float scale_y)
+{
+ Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
+ Image out = CONVERT_TO_IMAGE_STRUCT(out);
+ const float2 r = (float2)(scale_x, scale_y);
+ const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
+ vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/space_to_batch.cl b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
index cb11786ac4..e162a29bb0 100644
--- a/src/core/CL/cl_kernels/space_to_batch.cl
+++ b/src/core/CL/cl_kernels/nchw/space_to_batch.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -93,75 +93,7 @@ __kernel void space_to_batch_nchw(
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
}
}
-/** Calculate the space to batch conversion. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
- * @param[in] paddings_stride_x Stride of the paddinds tensor in X dimension (in bytes)
- * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] paddings_stride_y Stride of the paddinds tensor in Y dimension (in bytes)
- * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] paddingse_offset_first_element_in_bytes The offset of the first element in the second source image
- * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
- * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
- * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_nhwc(
- TENSOR4D_DECLARATION(input),
- IMAGE_DECLARATION(paddings),
- VECTOR_DECLARATION(block_shape),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
- Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
- const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
- const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
- const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
-
- int block_x = *((__global int *)vector_offset(&block, 0));
- int block_y = *((__global int *)vector_offset(&block, 1));
-
- const int out_x = get_global_id(1);
- const int out_y = get_global_id(2);
- const int z = get_global_id(0);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - pad_left_x;
- const int in_y = pos_y - pad_left_y;
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
- }
-}
#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
@@ -221,60 +153,4 @@ __kernel void space_to_batch_static_nchw(
*((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, w));
}
}
-/** Calculate the space to batch conversion. (NHWC)
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
- * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
- * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
- * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
- * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
- * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
- * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
- * @param[in] batch_id The output tensor batch id
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void space_to_batch_static_nhwc(
- TENSOR4D_DECLARATION(input),
- const int batch_id,
- TENSOR3D_DECLARATION(output))
-{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int block_x = BLOCK_SHAPE_X;
- int block_y = BLOCK_SHAPE_Y;
-
- const int out_x = get_global_id(1);
- const int out_y = get_global_id(2);
- const int z = get_global_id(0);
-
- const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
- const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
-
- if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
- {
- const int w = batch_id % BATCH_IN;
- const int in_x = pos_x - PAD_LEFT_X;
- const int in_y = pos_y - PAD_LEFT_Y;
-
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
- }
-}
#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
diff --git a/src/core/CL/cl_kernels/nchw/space_to_depth.cl b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
new file mode 100644
index 0000000000..aea02e813b
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NCHW)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nchw(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(0);
+ const int y = get_global_id(1);
+ const int z = get_global_id(2) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/upsample_layer.cl b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
new file mode 100644
index 0000000000..723c491165
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/upsample_layer.cl
@@ -0,0 +1,79 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+/** This function applies upsample on an input image. (NCHW)
+ *
+ * @attention The following variables must be passed at compile time:
+ * -# -DDATA_TYPE = Tensor data type. Supported data types: All
+ * -# -DVEC_SIZE_IN = Input vector size
+ * -# -DVEC_SIZE_OUT = Output vector size
+ * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
+ * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: All
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void upsample_layer_nchw(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
+ Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
+ const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
+ src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
+ dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ data = vload8(0, (__global DATA_TYPE *)src.ptr);
+
+ VEC_DATA_TYPE(DATA_TYPE, 16)
+ data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
+
+ vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
+ vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
+#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
+ *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
+#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
new file mode 100644
index 0000000000..85eff9e6d9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nchw/winograd_filter_transform.cl
@@ -0,0 +1,911 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(SRC_DIM_Z)
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x2_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0);
+ out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
+ out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
+ out0.s3 = (w0.s2);
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out1 = 0.0f;
+ out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
+ out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out2 = 0.0f;
+ out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
+ out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
+ out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
+ out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ out3 = 0.0f;
+ out3.s0 = (w2.s0);
+ out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
+ out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
+ out3.s3 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 16 channels for 3x3 kernels
+ // 4 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_3x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ VEC_DATA_TYPE(DATA_TYPE, 3)
+ w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = (w0.s0) / 16.f;
+ out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
+ out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
+ out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
+ out0.s5 = (w0.s2) / 4.f;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
+ out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
+ out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
+ out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
+ out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
+ out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
+ out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
+ out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
+ out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
+ out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (w2.s0) / 4.f;
+ out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
+ out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
+ out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
+ out5.s5 = (w2.s2);
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ // 36 channels for 3x3 kernels
+ // 6 channels for 3x1 or 1x3 kernels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ *
+ * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x4_5x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
+
+ const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
+
+ // Load the values from the input tensor
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
+ *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
+ DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
+ DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
+ DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
+ DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
+ VEC_DATA_TYPE(DATA_TYPE, 4)
+ w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
+ DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+ // Transform the input tile
+
+ // Row 0
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out0 = 0.0f;
+ out0.s0 = w00.s0;
+ out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
+ out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
+ out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
+ out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
+ out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
+ out0.s7 = w01;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ // Row 1
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out1 = 0.0f;
+ out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
+ out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
+ out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
+ out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
+ (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
+ out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
+
+ // Row 2
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out2 = 0.0f;
+ out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
+ out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
+ out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
+ out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
+ (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
+ out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
+
+ // Row 3
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out3 = 0.0f;
+ out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
+ out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
+ out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
+ out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 4
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out4 = 0.0f;
+ out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
+ out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
+ out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
+ out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
+ (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
+ (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
+ out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
+
+ // Row 5
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out5 = 0.0f;
+ out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
+ out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
+ out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
+ out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
+ out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
+
+ // Row 6
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out6 = 0.0f;
+ out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
+ out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
+ out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
+ out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
+ (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
+ (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
+ out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
+
+ // Row 7
+ VEC_DATA_TYPE(DATA_TYPE, 8)
+ out7 = 0.0f;
+ out7.s0 = w40.s0;
+ out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
+ out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
+ out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
+ out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
+ out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
+ out7.s7 = w41;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+
+ int z = get_global_id(2);
+ int x0 = z / SRC_DIM_Z; // idx filter
+ int y0 = z % SRC_DIM_Z; // idx channel
+
+ // Get output address
+ __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
+
+ // Store the values across the channels
+ *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
+ *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
+ *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
+ *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
+ *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
+ *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
+ *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
+ *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
+
+#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+ *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
+ *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
+ *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
+ *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
+ *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
+ *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
+ *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
+ *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
+ *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
+ *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
+ *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
+ *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
+ *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
+ *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
+ *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
+ *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
+ *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
+ *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
+ *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
+ *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
+ *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
+ *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
+ *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
+ *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
+ *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
+ *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
+ *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
+ *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
+ *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
+ *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
+ *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
+ *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
+ *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
+ *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
+ *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
+ *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
+ *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
+ *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
+ *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
+ *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
+ *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
+ *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
+ *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
+ *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
+ *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
+ *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
+ *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
+ *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
+ *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
+ *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
+ *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
+ *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
+ *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
+ *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
+ *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
+ *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
+#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+}
+
+#endif // defined(SRC_DIM_Z)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_2x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_3x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_4x1_5x1_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x2_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_2x2_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x3_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_3x3_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
+ *
+ * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
+ * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_filter_transform_1x4_1x5_nchw(
+ TENSOR4D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ winograd_filter_transform_4x4_5x5_nchw(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_offset_first_element_in_bytes);
+}
+
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
diff --git a/src/core/CL/cl_kernels/winograd_input_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
index fbb5e95196..8c382183c3 100644
--- a/src/core/CL/cl_kernels/winograd_input_transform.cl
+++ b/src/core/CL/cl_kernels/nchw/winograd_input_transform.cl
@@ -908,893 +908,6 @@ __kernel void winograd_input_transform_4x4_5x5_stepz1_nchw(
#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
}
-#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
- int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
- x -= PAD_LEFT;
- y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 6, 1, in);
- TILE(DATA_TYPE, 6, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v = 0;
- })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- TILE(DATA_TYPE, 6, 1, com);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v *= 4.0f;
- })
-
- com[0].v = in[2].v - 4.f * in[0].v;
- com[1].v = in[3].v - 4.f * in[1].v;
- com[2].v = in[4].v - 4.f * in[2].v;
- com[3].v = in[5].v - 4.f * in[3].v;
- com[4].v = in[3].v - in[1].v;
- com[4].v = com[4].v + com[4].v;
- com[5].v = in[4].v - in[2].v;
-
- out[0].v = com[2].v - com[0].v;
- out[1].v = com[2].v + com[1].v;
- out[2].v = com[2].v - com[1].v;
- out[3].v = com[5].v + com[4].v;
- out[4].v = com[5].v - com[4].v;
- out[5].v = com[3].v - com[1].v;
-
- TILE(uint, 6, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 6;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 36, 1, in);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- in[i].v = 0;
- })
-
- // Load the tile from a NHWC tensor
- T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
- TILE(DATA_TYPE, 6, 1, com);
- TILE(DATA_TYPE, 36, 1, tmp);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
- com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
- com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
- com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
- com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
- com[4].v = com[4].v + com[4].v;
- com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
- tmp[i + 0 * 6].v = com[2].v - com[0].v;
- tmp[i + 1 * 6].v = com[2].v + com[1].v;
- tmp[i + 2 * 6].v = com[2].v - com[1].v;
- tmp[i + 3 * 6].v = com[5].v + com[4].v;
- tmp[i + 4 * 6].v = com[5].v - com[4].v;
- tmp[i + 5 * 6].v = com[3].v - com[1].v;
- })
-
- TILE(DATA_TYPE, 36, 1, out);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- com[0].v = tmp[i * 6 + 2].v - 4.f * tmp[i * 6 + 0].v;
- com[1].v = tmp[i * 6 + 3].v - 4.f * tmp[i * 6 + 1].v;
- com[2].v = tmp[i * 6 + 4].v - 4.f * tmp[i * 6 + 2].v;
- com[3].v = tmp[i * 6 + 5].v - 4.f * tmp[i * 6 + 3].v;
- com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
- com[4].v = com[4].v + com[4].v;
- com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
- out[i * 6 + 0].v = com[2].v - com[0].v;
- out[i * 6 + 1].v = com[2].v + com[1].v;
- out[i * 6 + 2].v = com[2].v - com[1].v;
- out[i * 6 + 3].v = com[5].v + com[4].v;
- out[i * 6 + 4].v = com[5].v - com[4].v;
- out[i * 6 + 5].v = com[3].v - com[1].v;
- })
-
- // Compute destination address
- TILE(uint, 36, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 36;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
- int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
- x -= PAD_LEFT;
- y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 8, 1, in);
- TILE(DATA_TYPE, 8, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- TILE(DATA_TYPE, 1, 8, com);
-
- com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
- com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
- com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
- com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
- com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
- com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
- out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
- out[1].s[0] = com[0].s[0] + com[0].s[1];
- out[2].s[0] = com[0].s[0] - com[0].s[1];
- out[3].s[0] = com[0].s[3] + com[0].s[2];
- out[4].s[0] = com[0].s[3] - com[0].s[2];
- out[5].s[0] = com[0].s[4] + com[0].s[5];
- out[6].s[0] = com[0].s[4] - com[0].s[5];
- out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
-
- TILE(uint, 8, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 64, 1, in);
- TILE(DATA_TYPE, 64, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // Load the tile from a NHWC tensor
- T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
- TILE(DATA_TYPE, 8, 8, com);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
- com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
- com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
- com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
- com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
- com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
- com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
- })
-
- TILE(DATA_TYPE, 8, 8, tmp);
- tmp[0].v = com[6].v;
- tmp[1].v = com[0].v + com[1].v;
- tmp[2].v = com[0].v - com[1].v;
- tmp[3].v = com[2].v + com[3].v;
- tmp[4].v = com[2].v - com[3].v;
- tmp[5].v = com[4].v + com[5].v;
- tmp[6].v = com[4].v - com[5].v;
- tmp[7].v = com[7].v;
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
- com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
- com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
- out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
- out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
- out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
- out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
- out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
- out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
- out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
- out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
- })
-
- TILE(uint, 64, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- // All the tensor dimensions are passed at compile time.
- // In case of dynamic tensor support, the following dimensions should be passed as function argument.
-#define _ISRC_WIDTH SRC_WIDTH
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-#define _INUM_TILES_Y NUM_TILES_Y
-
- int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
- x -= PAD_LEFT;
- y -= PAD_TOP;
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 8, 1, in);
- TILE(DATA_TYPE, 8, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
-#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
- T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v *= (DATA_TYPE) - 36.0f;
- })
-
- TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
-
- com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
- com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
- com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
- com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
- com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
- com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
- out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
- out[1].s[0] = com[0].s[0] - com[0].s[1];
- out[2].s[0] = com[0].s[0] + com[0].s[1];
- out[3].s[0] = com[0].s[2] - com[0].s[3];
- out[4].s[0] = com[0].s[2] + com[0].s[3];
- out[5].s[0] = com[0].s[4] - com[0].s[5];
- out[6].s[0] = com[0].s[4] + com[0].s[5];
- out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
-
- TILE(uint, 8, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 8;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 64, 1, in);
- TILE(DATA_TYPE, 64, 1, out);
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // Load the tile from a NHWC tensor
- T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
-
- TILE(DATA_TYPE, 8, 8, com);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
- com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
- com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
- com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
- com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
- com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
- })
-
- TILE(DATA_TYPE, 8, 8, tmp);
- tmp[0].v = com[6].v;
- tmp[1].v = com[0].v - com[1].v;
- tmp[2].v = com[0].v + com[1].v;
- tmp[3].v = com[2].v - com[3].v;
- tmp[4].v = com[2].v + com[3].v;
- tmp[5].v = com[4].v - com[5].v;
- tmp[6].v = com[4].v + com[5].v;
- tmp[7].v = com[7].v;
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
- com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
- com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
- com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
- out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
- out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
- out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
- out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
- out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
- out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
- out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
- out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
- })
-
- TILE(uint, 64, 1, dst_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- dst_indirect_y[i].v = mout + i * _INUM_TILES_X * _INUM_TILES_Y;
- dst_indirect_y[i].v += bout * _INUM_TILES_X * _INUM_TILES_Y * 64;
- })
-
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-
-//! @cond Doxygen_Suppress
-/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
- *
- * @note Data layout supported: NHWC
- * @note Data type supported: F32/F16
- * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
- * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
- * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
- * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-//! @endcond
-__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER))
-{
- winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes);
-}
-#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
-
#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 2x1
*
diff --git a/src/core/CL/cl_kernels/winograd_output_transform.cl b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
index 6a3e6d3346..861ed50651 100644
--- a/src/core/CL/cl_kernels/winograd_output_transform.cl
+++ b/src/core/CL/cl_kernels/nchw/winograd_output_transform.cl
@@ -176,181 +176,6 @@ __kernel void winograd_output_transform_2x2_3x3_nchw(
(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x2_7x7_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
-#define _ISRC_HEIGHT SRC_HEIGHT
-#define _IDST_WIDTH DST_WIDTH
-#define _IDST_HEIGHT DST_HEIGHT
-#define _INUM_TILES_X NUM_TILES_X
-
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
- int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- TILE(DATA_TYPE, 8, N0, in);
- TILE(DATA_TYPE, 2, N0, out);
- TILE(uint, 8, 1, src_indirect_y);
-
- // Calculate the indirect Y for the source tensor
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 8 channels to compose the 8x1 tile
- T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // Compute out0 and out01
- out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
- out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
-
-#if defined(HAS_BIAS)
- // Add bias
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
- T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 2, 1, dst_indirect_y);
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 2,
- {
- int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 2,
- {
- int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 64, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(DATA_TYPE, 16, N0, tmp);
- TILE(uint, 64, 1, src_indirect_y);
-
- // Calculate the indirect Y for the source tensor
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- src_indirect_y[i].v = mout + i * _ISRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 64 channels to compose the 8x8 tile
- T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
- tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
- })
-
- // Compute the 2x2 output tile
- LOOP_UNROLLING(int, i, 0, 1, 2,
- {
- out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
- out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
- })
-
-#if defined(HAS_BIAS)
- // Add bias
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // defined(HAS_BIAS)
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 2,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 2,
- {
- int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 2].v = x_c + y_c * _IDST_WIDTH;
- dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
- })
- })
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -577,200 +402,6 @@ __kernel void winograd_output_transform_4x4_3x3_nchw(
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] dst_size Size of the destination tensor, minus the last padding
- */
-__kernel void winograd_output_transform_4x4_3x3_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- TILE(DATA_TYPE, 6, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(uint, 6, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
- T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // Compute out00, out01, out02 and out03
- out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
- out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
- out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
- out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
- dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
- dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Calculate the indirect Y for the source tensor
- TILE(DATA_TYPE, 36, N0, in);
- TILE(DATA_TYPE, 4, N0, tmp);
- TILE(uint, 36, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 36,
- {
- in[i].v = 0;
- })
-
- // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
- T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- LOOP_UNROLLING(int, i, 0, 1, 6,
- {
- tmp[0].v = in[6 + i].v + in[12 + i].v;
- tmp[1].v = in[6 + i].v - in[12 + i].v;
- tmp[2].v = in[18 + i].v + in[24 + i].v;
- tmp[3].v = in[18 + i].v - in[24 + i].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- in[i].v = in[i].v + tmp[0].v + tmp[2].v;
- in[6 + i].v = tmp[3].v + tmp[1].v;
- in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
- in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
- })
-
- // Compute the output tile
- TILE(DATA_TYPE, 16, N0, out);
-
- LOOP_UNROLLING(int, i, 0, 1, 4,
- {
- tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
- tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
- tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
- tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
- out[4 * i + 1].v = tmp[3].v + tmp[1].v;
- out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
- out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
- })
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 16, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
- dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
- })
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
-
#define COMPUTE_TMP_COL(col, d0, d1, d2, d3, d4, d5, d6, d7, comm_fact) \
({ \
comm_fact.s0 = d1 + d2; \
@@ -1023,214 +654,6 @@ __kernel void winograd_output_transform_4x4_5x5_nchw(
0, (__global DATA_TYPE *)(dst_addr + 3 * dst_stride_y));
#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x4_5x5_nhwc(
- TENSOR4D(src, BUFFER),
- TENSOR4D(dst, BUFFER),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
- const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
- const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
-
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- TILE(DATA_TYPE, 8, N0, in);
- TILE(DATA_TYPE, 4, N0, out);
- TILE(DATA_TYPE, 4, N0, tmp);
- TILE(uint, 8, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- in[i].v = 0;
- })
-
- // "in" contains 1x8 or 8x1 tile here
- T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // A^T * in, and in this degenerate case out consists of 1 column/row
- tmp[0].v = in[1].v - in[2].v;
- tmp[1].v = 2.0f * (in[3].v - in[4].v);
- tmp[2].v = 2.0f * (in[5].v + in[6].v);
- tmp[3].v = in[3].v + in[4].v;
- out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
- out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
- out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
- out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 4, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
-#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[yk].v = x_out + y_c * DST_WIDTH;
- dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- dst_indirect_y[xk].v = x_c + y_out * DST_WIDTH;
- dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-
-#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
- // Calculate the indirect Y for the source tensor
- TILE(DATA_TYPE, 64, N0, in);
- TILE(DATA_TYPE, 6, N0, tmp);
- TILE(uint, 64, 1, src_indirect_y);
-
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- src_indirect_y[i].v = mout + i * SRC_HEIGHT;
- src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
- })
-
- // Initialize the input tile
- LOOP_UNROLLING(int, i, 0, 1, 64,
- {
- in[i].v = 0;
- })
-
- // "in" here is 8x8 tile
- T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
-
- // A^T * in
- LOOP_UNROLLING(int, i, 0, 1, 8,
- {
- tmp[0].v = in[8 + i].v + in[16 + i].v;
- tmp[1].v = in[8 + i].v - in[16 + i].v;
- tmp[2].v = in[24 + i].v + in[32 + i].v;
- tmp[3].v = in[24 + i].v - in[32 + i].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- tmp[4].v = in[40 + i].v + in[48 + i].v;
- tmp[4].v = tmp[4].v + tmp[4].v;
- tmp[5].v = in[40 + i].v - in[48 + i].v;
-
- // 4x8 matrix as a result
- in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
- in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
- in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
- in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
- })
-
- // Compute the output tile
- TILE(DATA_TYPE, 16, N0, out);
-
- // in * A, with in = A^T * in as above
- LOOP_UNROLLING(int, i, 0, 1, 4,
- {
- tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
- tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
- tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
- tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
- tmp[3].v = tmp[3].v + tmp[3].v;
- tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
- tmp[4].v = tmp[4].v + tmp[4].v;
- tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
-
- // 4x4 tile
- out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
- out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
- out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
- out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
- })
-
-#if defined(HAS_BIAS)
- TILE(DATA_TYPE, 1, N0, b);
-
- T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
-
- // c = c + bias[broadcasted]
- T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
-#endif // HAS_BIAS
-
- int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
- int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
-
- T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
-
- TILE(uint, 16, 1, dst_indirect_y);
-
- // Calculate the destination indirect Y
- LOOP_UNROLLING(int, yk, 0, 1, 4,
- {
- LOOP_UNROLLING(int, xk, 0, 1, 4,
- {
- int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
- int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
- dst_indirect_y[xk + yk * 4].v = x_c + y_c * DST_WIDTH;
- dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
- })
- })
-
- // Store the tile in reverse order so the invalid values are overwritten with the valid ones
- T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
-#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
-}
#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
@@ -1303,73 +726,6 @@ __kernel void winograd_output_transform_2x1_3x1_nchw(
);
}
-/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_2x1_7x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1509,141 +865,6 @@ __kernel void winograd_output_transform_4x1_5x1_nchw(
);
}
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_3x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_4x1_5x1_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
@@ -1717,73 +938,6 @@ __kernel void winograd_output_transform_1x2_1x3_nchw(
);
}
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x2_1x7_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_2x2_7x7_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
#endif // defined(VEC_SIZE) && VEC_SIZE == 2
#if defined(VEC_SIZE) && VEC_SIZE == 4
@@ -1923,141 +1077,6 @@ __kernel void winograd_output_transform_1x4_1x5_nchw(
);
}
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x3_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_3x3_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
-
-/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
- *
- * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
- * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
- * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
- * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
- * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
- * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_output_transform_1x4_1x5_nhwc(
- TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst),
-#if defined(HAS_BIAS)
- VECTOR_DECLARATION(bias),
-#endif // defined(HAS_BIAS)
- int dst_size)
-{
- winograd_output_transform_4x4_5x5_nhwc(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_stride_w,
- dst_step_w,
- dst_offset_first_element_in_bytes,
-#if defined(HAS_BIAS)
- bias_ptr,
- bias_stride_x,
- bias_step_x,
- bias_offset_first_element_in_bytes,
-#endif // defined(HAS_BIAS)
- dst_size);
-}
#endif // defined(VEC_SIZE) && VEC_SIZE == 4
#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
index 1217a37345..a5334525fe 100644
--- a/src/core/CL/cl_kernels/space_to_depth.cl
+++ b/src/core/CL/cl_kernels/nhwc/batch_to_space.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2019-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,12 +23,12 @@
*/
#include "helpers.h"
-#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
-/** Space to depth transformation. (NCHW)
+#if defined(DATA_TYPE) && defined(BATCH_SIZE)
+/** Batch to space transformation. (NHWC)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -39,6 +39,12 @@
* @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[in] batch_id The input tensor batch id
+ * @param[in] block_shape_ptr Pointer to the source tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] block_shape_step_y block_shape_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
* @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
* @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
* @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
@@ -48,29 +54,39 @@
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void space_to_depth_nchw(
- TENSOR4D_DECLARATION(input),
+__kernel void batch_to_space_nhwc(
+ TENSOR3D_DECLARATION(input),
const int batch_id,
- TENSOR3D_DECLARATION(output))
+ VECTOR_DECLARATION(block_shape),
+ TENSOR4D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
- const int x = get_global_id(0);
- const int y = get_global_id(1);
- const int z = get_global_id(2) % r;
+ const int block_x = *((__global int *)vector_offset(&block, 0));
+ const int block_y = *((__global int *)vector_offset(&block, 1));
- const int in_x = x * BLOCK_SHAPE + (get_global_id(2) / r) % BLOCK_SHAPE;
- const int in_y = y * BLOCK_SHAPE + (get_global_id(2) / r) / BLOCK_SHAPE;
+ const int r = (BATCH_SIZE / (block_x * block_y));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0);
+ const int w = batch_id % r;
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, in_x, in_y, z, batch_id));
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
}
-/** Space to depth transformation. (NHWC)
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE)
+
+#if defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y)
+/** Batch to space transformation. (NHWC)
*
* @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
- * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
*
* @param[in] input_ptr Pointer to the source tensor. Supported data types: All
* @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
@@ -90,22 +106,26 @@ __kernel void space_to_depth_nchw(
* @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
* @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
*/
-__kernel void space_to_depth_nhwc(
- TENSOR4D_DECLARATION(input),
+__kernel void batch_to_space_static_nhwc(
+ TENSOR3D_DECLARATION(input),
const int batch_id,
- TENSOR3D_DECLARATION(output))
+ TENSOR4D_DECLARATION(output))
{
- Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ const int block_x = BLOCK_SHAPE_X;
+ const int block_y = BLOCK_SHAPE_Y;
- const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int r = (BATCH_SIZE / (block_x * block_y));
const int x = get_global_id(1);
const int y = get_global_id(2);
- const int z = get_global_id(0) % r;
+ const int z = get_global_id(0);
+ const int w = batch_id % r;
- const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
- const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+ const int out_x = x * block_x + (batch_id / r) % block_x;
+ const int out_y = y * block_y + (batch_id / r) / block_x;
- *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, w)) = *((__global DATA_TYPE *)in.ptr);
}
-#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
+#endif // defined(DATA_TYPE) && defined(BATCH_SIZE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
new file mode 100644
index 0000000000..cb2da1bd99
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/batchnormalization_layer.cl
@@ -0,0 +1,146 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#define ADD_OP(a, b) ((a) + (b))
+#define SUB_OP(a, b) ((a) - (b))
+#define MUL_OP(a, b) ((a) * (b))
+#define INVSQRT_OP(a) rsqrt((a))
+#define SQCVT_SAT(a) (a)
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(ACTIVATION_TYPE)
+#include "activation_float_helpers.h"
+
+/** Apply batch normalization on tensors with NHWC format.
+ *
+ * @note It is possible to select the activation function to apply using -DACTIVATION_TYPE e.g. -DACTIVATION_TYPE=relu
+ * @note A, B variables required by some activation functions are set using -DA_VAL= and -DB_VAL= respectively
+ *
+ * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
+ * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p input_ptr
+ * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
+ * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
+ * @param[in] var_ptr Pointer to the var tensor. Supported data types: same as @p input_ptr
+ * @param[in] var_stride_x Stride of the var tensor in X dimension (in bytes)
+ * @param[in] var_step_x var_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] var_offset_first_element_in_bytes The offset of the first element in the var source tensor
+ * @param[in] beta_ptr Pointer to the beta source tensor. Supported data types: same as @p input_ptr
+ * @param[in] beta_stride_x Stride of the beta source tensor in X dimension (in bytes)
+ * @param[in] beta_step_x beta_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] beta_offset_first_element_in_bytes The offset of the first element in the beta source tensor
+ * @param[in] gamma_ptr Pointer to the gamma source tensor. Supported data types: same as @p input_ptr
+ * @param[in] gamma_stride_x Stride of the gamma source tensor in X dimension (in bytes)
+ * @param[in] gamma_step_x gamma_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] gamma_offset_first_element_in_bytes The offset of the first element in the gamma source tensor
+ * @param[in] epsilon Epsilon parameter in the batch normalization equation
+ */
+__kernel void batchnormalization_layer_nhwc(TENSOR3D_DECLARATION(input),
+#ifndef IN_PLACE
+ TENSOR3D_DECLARATION(output),
+#endif /* not IN_PLACE */
+ VECTOR_DECLARATION(mean),
+ VECTOR_DECLARATION(var),
+#ifndef USE_DEFAULT_BETA
+ VECTOR_DECLARATION(beta),
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ VECTOR_DECLARATION(gamma),
+#endif /* USE_DEFAULT_GAMMA */
+ float epsilon)
+{
+ uint x_offs = max((int)(get_global_id(0) * VEC_SIZE * sizeof(DATA_TYPE) - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE * sizeof(DATA_TYPE)), 0);
+
+ __global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs + get_global_id(1) * input_stride_y + get_global_id(2) * input_stride_z;
+#ifdef IN_PLACE
+ __global uchar *output_addr = input_ptr;
+#else /* IN_PLACE */
+ __global uchar *output_addr = output_ptr + output_offset_first_element_in_bytes + x_offs + get_global_id(1) * output_stride_y + get_global_id(2) * output_stride_z;
+#endif /* IN_PLACE */
+ __global uchar *mean_addr = mean_ptr + mean_offset_first_element_in_bytes + x_offs;
+ __global uchar *var_addr = var_ptr + var_offset_first_element_in_bytes + x_offs;
+#ifndef USE_DEFAULT_BETA
+ __global uchar *beta_addr = beta_ptr + beta_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_BETA */
+#ifndef USE_DEFAULT_GAMMA
+ __global uchar *gamma_addr = gamma_ptr + gamma_offset_first_element_in_bytes + x_offs;
+#endif /* USE_DEFAULT_GAMMA */
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ denominator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ numerator = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ x_bar = 0;
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res0 = 0;
+
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)input_addr);
+ denominator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)var_addr);
+ denominator = INVSQRT_OP(ADD_OP(denominator, ((VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(epsilon))));
+
+ // Calculate x bar and store results
+ numerator = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)mean_addr);
+ numerator = SUB_OP(data, numerator);
+ x_bar = MUL_OP(numerator, denominator);
+
+#ifndef USE_DEFAULT_GAMMA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ gamma_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)gamma_addr);
+
+ res0 = MUL_OP(gamma_vec, x_bar);
+#else /* USE_DEFAULT_GAMMA */
+ // gamma is equal to 1, no need to perform multiplications
+ res0 = x_bar;
+#endif /* USE_DEFAULT_GAMMA */
+
+#ifndef USE_DEFAULT_BETA
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ beta_vec = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)beta_addr);
+ // beta is not zero, hence we need to perform the addition
+ res0 = ADD_OP(res0, beta_vec);
+#endif /* USE_DEFAULT_BETA */
+
+ res0 = ACTIVATION(ACTIVATION_TYPE, DATA_TYPE, VEC_SIZE, res0, A_VAL, B_VAL);
+
+ STORE_VECTOR_SELECT(res, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0)
+}
+#endif /* defined(VEC_SIZE) && defined(DATA_TYPE) && defined(DATA_TYPE)*/ \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/channel_shuffle.cl b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
index 63af2c6137..233beb3aa9 100644
--- a/src/core/CL/cl_kernels/channel_shuffle.cl
+++ b/src/core/CL/cl_kernels/nhwc/channel_shuffle.cl
@@ -38,68 +38,6 @@
mod_res = (x)-r; \
})
-/** Performs channel shuffle when the data layout is NCHW. See https://arxiv.org/pdf/1707.01083.pdf for details.
- *
- * @note The vector size must be given as a preprocessor argument using -DVEC_SIZE=num. e.g. -DVEC_SIZE=4
- * @note The depth of the tensor must be given as a preprocessor argument using -DSRC_DIM_Z=num. e.g. -DSRC_DIM_Z=64
- * @note The number of groups must be given as a preprocessor argument using -DNUM_GROUPS=num_groups. e.g. -DNUM_GROUPS=2
- * @note The number of channels in each group must be given as a preprocessor argument using -DK=num. e.g. -DK=1
- * K is equal to num_channels / num_groups.
- *
- * @param[in] src_ptr Pointer to the source matrix. Supported data types: All
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_w src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_stride_w Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void channel_shuffle_nchw(TENSOR4D_DECLARATION(src),
- TENSOR4D_DECLARATION(dst))
-{
- uint curr_channel = 0; // channel id of input
- uint batch_id = 0; // batch id
- uint group_id = 0; // group id
- uint channel_id = 0; // channel id within the group
-
- // Compute curr_channel and batch_id
- DIV_MOD_UINT(get_global_id(2), SRC_DIM_Z, batch_id, curr_channel);
-
- // Compute group_id and channel_id
- DIV_MOD_UINT(curr_channel, K, group_id, channel_id);
-
- const uint x = get_global_id(0) * VEC_SIZE;
- const uint y = get_global_id(1) * 2;
- const uint z = channel_id * NUM_GROUPS + group_id;
-
- // Load the Nx2 block
- const __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * src_stride_y + curr_channel * src_stride_z + batch_id * src_stride_w;
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- u0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- u1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_ptr + 1 * src_stride_y));
-
- // Store blocks
- __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + x * sizeof(DATA_TYPE) + y * dst_stride_y + z * dst_stride_z + batch_id * dst_stride_w;
- VSTORE(VEC_SIZE)
- (u0, 0, (__global DATA_TYPE *)(output_ptr + 0 * dst_stride_y));
- VSTORE(VEC_SIZE)
- (u1, 0, (__global DATA_TYPE *)(output_ptr + 1 * dst_stride_y));
-}
-
#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
/** Performs channel shuffle when the data layout is NHWC. See https://arxiv.org/pdf/1707.01083.pdf for details.
@@ -219,4 +157,4 @@ __kernel void channel_shuffle_nhwc(TENSOR4D_DECLARATION(src),
STORE_VECTOR_SELECT(out, DATA_TYPE, output_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_DIM_X)
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(NUM_GROUPS) && defined(K) && defined(SRC_DIM_Z) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/depth_to_space.cl b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
new file mode 100644
index 0000000000..5464a4bef8
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/depth_to_space.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Depth to space transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor depth size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All.
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void depth_to_space_nhwc(
+ TENSOR3D_DECLARATION(input),
+ const int batch_id,
+ TENSOR4D_DECLARATION(output))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor4D out = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(output, 0);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int out_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int out_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)tensor4D_offset(&out, z, out_x, out_y, batch_id)) = *((__global DATA_TYPE *)in.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
new file mode 100644
index 0000000000..238d3a7921
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/dequantization_layer.cl
@@ -0,0 +1,87 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST)
+/** This performs per channel dequantization of 8-bit signed integers to floating point. (NHWC)
+ *
+ * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char
+ * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float
+ * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: QSYMM8_PER_CHANNEL
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] scale Pointer to buffer with the per channel quantized scales
+ */
+__kernel void dequantization_layer_per_channel_nhwc(
+ TENSOR3D_DECLARATION(input),
+ TENSOR3D_DECLARATION(output),
+ __global float *scale)
+{
+ // Get pixels pointer
+ Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
+ Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+#if defined(LAST_ACCESSED_X)
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x;
+ output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x;
+ scale -= max(xi - (int)LAST_ACCESSED_X, 0);
+
+ // Load data
+ VEC_DATA_TYPE(int, VEC_SIZE)
+ val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE));
+
+ // Create scale vectors
+ const VEC_DATA_TYPE(float, VEC_SIZE)
+ vscale = VLOAD(VEC_SIZE)(0, &scale[xi]);
+
+ // Dequantize
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ res = vscale * CONVERT((val), VEC_DATA_TYPE(float, VEC_SIZE));
+
+ // Store result
+ VSTORE(VEC_SIZE)
+ (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr);
+#else // !defined(LAST_ACCESSED_X)
+ *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr)))) * scale[get_global_id(0)]);
+#endif // defined(LAST_ACCESSED_X)
+}
+#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/direct_convolution.cl b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
index 75a7a0f004..75a7a0f004 100644
--- a/src/core/CL/cl_kernels/direct_convolution.cl
+++ b/src/core/CL/cl_kernels/nhwc/direct_convolution.cl
diff --git a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
index 1f940001f3..d2e7e45ada 100644
--- a/src/core/CL/cl_kernels/dwc_native_fp_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_fp_nhwc.cl
@@ -125,7 +125,7 @@ __kernel void dwc_native_fp_nhwc(
const int bout = GET_SPATIAL_IDX(2, 1, 0) / _IDST_HEIGHT; // BATCH SIZE IDX
#else // defined(BATCHED_EXECUTION)
const int yo = GET_SPATIAL_IDX(2, 1, 0); // HEIGHT
- const int bout = 0; // BATCH SIZE IDX
+ const int bout = 0; // BATCH SIZE IDX
#endif // defined(BATCHED_EXECUTION)
int xi = xo * STRIDE_X;
@@ -148,37 +148,37 @@ __kernel void dwc_native_fp_nhwc(
#if _IWEI_HEIGHT <= 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-#else // _IWEI_HEIGHT <= 5
+#else // _IWEI_HEIGHT <= 5
for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
#endif // _IWEI_HEIGHT <= 5
- {
- TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
- {
- a[i].v = 0;
- })
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = 0;
+ })
- // Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
- TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
- // Load tile from the weights tensor (TILE B)
- T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, (cout * DEPTH_MULTIPLIER) + d, yk * _IM0_B, 1, wei_stride_y, b);
- // Optimized path for STRIDE_X == 1
- // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
- LOOP_UNROLLING(int, m0, 0, 1, M0,
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
{
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- c[m0].v += a[xk + m0].v * b[xk].v;
- })
+ c[m0].v += a[xk + m0].v *b[xk].v;
})
- }
+ })
+ }
#if _IWEI_HEIGHT <= 5
- )
+ )
#endif // _IWEI_HEIGHT <= 5
#if defined(HAS_BIAS)
diff --git a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
index aa6ba4de39..1bc58b6e26 100644
--- a/src/core/CL/cl_kernels/dwc_native_quantized_nhwc.cl
+++ b/src/core/CL/cl_kernels/nhwc/dwc_native_quantized_nhwc.cl
@@ -179,61 +179,61 @@ __kernel void dwc_native_quantized_nhwc(
#if _IWEI_HEIGHT <= 5
LOOP_UNROLLING(int, yk, 0, 1, _IWEI_HEIGHT,
-#else // _IWEI_HEIGHT <= 5
+#else // _IWEI_HEIGHT <= 5
for(int yk = 0; yk < _IWEI_HEIGHT; yk++)
#endif // _IWEI_HEIGHT <= 5
- {
- TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
+ {
+ TILE(SRC_DATA_TYPE, _IM0_A, _IN0_A, a);
- LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
- {
- a[i].v = ZERO_VALUE;
- })
+ LOOP_UNROLLING(int, i, 0, 1, _IM0_A,
+ {
+ a[i].v = ZERO_VALUE;
+ })
- // Load tile from the src tensor (TILE A)
- T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
+ // Load tile from the src tensor (TILE A)
+ T_LOAD_NHWC_WITH_DILATION(SRC_DATA_TYPE, 1, _IM0_A, _IN0_A, SRC_TENSOR_TYPE, src, bout, yi + yk * DILATION_Y, xi, cout, _ISRC_WIDTH, _ISRC_HEIGHT, DILATION_X, 1, _IBOUNDARY_CHECK, a);
- TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
+ TILE(WEI_DATA_TYPE, _IM0_B, _IN0_B, b);
- // Load tile from the weights tensor (TILE B)
- T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
+ // Load tile from the weights tensor (TILE B)
+ T_LOAD(WEI_DATA_TYPE, _IM0_B, _IN0_B, WEI_TENSOR_TYPE, wei, cout * DEPTH_MULTIPLIER + d, yk * _IM0_B, 1, wei_stride_y, b);
- // Optimized path for STRIDE_X == 1
- // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
- LOOP_UNROLLING(int, m0, 0, 1, M0,
+ // Optimized path for STRIDE_X == 1
+ // If M0 != 1, we can skip the common loads between the two applied kernels on the X (WIDTH) dimension
+ LOOP_UNROLLING(int, m0, 0, 1, M0,
+ {
+ LOOP_UNROLLING(int, n0, 0, 1, N0,
{
- LOOP_UNROLLING(int, n0, 0, 1, N0,
- {
#if _IWEI_WIDTH <= 16
#define DOT_DATA_TYPE SRC_DATA_TYPE
#define WEI_OFFSET_CORRECTION (CALCULATE_WEIGHTS_OFFSET_CORRECTION(SRC_DATA_TYPE, WEI_DATA_TYPE))
- // Optimized path for the dot instruction
- TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
- TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
- ACC_DATA_TYPE offset_a = 0;
- ACC_DATA_TYPE offset_b = 0;
+ // Optimized path for the dot instruction
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, x0);
+ TILE(DOT_DATA_TYPE, 1, _IWEI_WIDTH, y0);
+ ACC_DATA_TYPE offset_a = 0;
+ ACC_DATA_TYPE offset_b = 0;
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- x0[0].s[xk] = a[xk + m0].s[n0];
- y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
- })
- DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
- REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
- REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
- c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ x0[0].s[xk] = a[xk + m0].s[n0];
+ y0[0].s[xk] = b[xk].s[n0] + (int)WEI_OFFSET_CORRECTION;
+ })
+ DOT_PRODUCT_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, y0[0].v, c[m0].s[n0]);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, x0[0].v, offset_a);
+ REDUCE_INTEGER8(DOT_DATA_TYPE, DOT_DATA_TYPE, ACC_DATA_TYPE, _IWEI_WIDTH, y0[0].v, offset_b);
+ c[m0].s[n0] += offset_a * (ACC_DATA_TYPE)(WEI_OFFSET - (ACC_DATA_TYPE)WEI_OFFSET_CORRECTION) + offset_b * (ACC_DATA_TYPE)SRC_OFFSET;
#else // _IWEI_WIDTH <= 16
- LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
- {
- c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
- })
-#endif // _IWEI_WIDTH <= 16
+ LOOP_UNROLLING(int, xk, 0, 1, _IWEI_WIDTH,
+ {
+ c[m0].s[n0] += ((ACC_DATA_TYPE)a[xk + m0].s[n0] + (ACC_DATA_TYPE)(SRC_OFFSET)) * ((ACC_DATA_TYPE)b[xk].s[n0] + (ACC_DATA_TYPE)(WEI_OFFSET));
})
+#endif // _IWEI_WIDTH <= 16
})
- }
+ })
+ }
#if _IWEI_HEIGHT <= 5
- )
+ )
#endif // _IWEI_HEIGHT <= 5
#if _IWEI_WIDTH <= 16
diff --git a/src/core/CL/cl_kernels/nhwc/im2col.cl b/src/core/CL/cl_kernels/nhwc/im2col.cl
new file mode 100644
index 0000000000..ac00c11283
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/im2col.cl
@@ -0,0 +1,532 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#if defined(DATA_TYPE) && defined(ELEMENT_SIZE)
+
+#if ELEMENT_SIZE == 1
+#define COND_DATA_TYPE char
+#elif ELEMENT_SIZE == 2
+#define COND_DATA_TYPE short
+#elif ELEMENT_SIZE == 4
+#define COND_DATA_TYPE int
+#else // ELEMENT_SIZE
+#error "Element size not support"
+#endif // ELEMENT_SIZE
+
+#if defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
+
+#define VECTOR_N VEC_DATA_TYPE(DATA_TYPE, VECTOR_SIZE)
+#define COND_N VEC_DATA_TYPE(COND_DATA_TYPE, VECTOR_SIZE)
+
+/** Store a 1x9 row or a 3x3 block in a boundary-aware manner to avoid paddings in the channel dimension
+ * @name IM2COL1X9_NHWC_STORE
+ *
+ * @note To use this macro for a 3x3 block, @p ROW has to be 0
+ *
+ * @param[in] VECTOR_SIZE The non-boundary vector width of @p DATA. Supported: 1(scalar), 2, 3, 4, 8, 16
+ * @param[in] BOUNDARY_VECTOR_SIZE The boundary vector width of @p DATA. Supported: 1-16, but has to be <= @p size
+ * @param[in] DATA_TYPE Data type of @p DATA
+ * @param[in] SRC_DEPTH Input channel size / depth
+ * @param[in] DATA Value variable base name
+ * @param[in] ROW The row number to store. Supported: 0-8
+ * @param[in] OUTPUT_PTR Output pointer
+ * @{
+ */
+#if defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ const bool at_channel_boundary = get_global_id(0) == 0; \
+ if(at_channel_boundary) \
+ { \
+ IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ } \
+ else \
+ { \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ }
+#else // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+#define IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR)
+#endif // defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE) && BOUNDARY_VECTOR_SIZE < VECTOR_SIZE
+
+#define IM2COL1X9_NHWC_STORE_NONPARTIAL(VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE(VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+
+#define IM2COL1X9_NHWC_STORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, DATA, ROW, OUTPUT_PTR) \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##0, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (0 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##1, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (1 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##2, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (2 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##3, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (3 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##4, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (4 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##5, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (5 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##6, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (6 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##7, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (7 + ROW * 9) * SRC_DEPTH); \
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE) \
+ (DATA##8, 0, (__global DATA_TYPE *)(OUTPUT_PTR) + (8 + ROW * 9) * SRC_DEPTH);
+/** @}*/
+
+/** This kernel performs im2col when the kernel size is 3x3 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col3x3_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int3 offset = 0;
+
+ // Clamp xi
+ int3 xi_offset = ((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT);
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset = CLAMP(xi_offset, (int3)0, (int3)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ // Multiply by src_stride_y as the width (X) dimension here is the second (y) dimension in src NHWC tensor
+ xi_offset *= (int3)src_stride_y;
+
+ // Out-of-bound condition for X
+ int3 x_cond = (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) < (int3)0) || (((int3)xi + (int3)(0, 1, 2) * DILATION_X - (int3)PAD_LEFT) >= (int3)SRC_WIDTH);
+
+ // yi == 0
+ // Clamp yi
+ // yi_coord is casted to unsigned int in order to use just a min() operation
+ // A "-1" 32 bit signed variable converted to unsigned gives 4294967295
+ // This is a trick so that the values loaded in the padding areas are always from the last row (SRC_HEIGHT - 1),
+ // because of the negative yi_coord wrap-around, but it gets overwritten by PAD_VALUE immediately as the wrap-around
+ // also causes y_cond (y padding condition) to be satisfied
+ yi_coord = yi - (int)PAD_TOP;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP) >= (uint)(SRC_HEIGHT));
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 1
+ // Clamp yi_coord (it can be negative if PAD_TOP > 1)
+ yi_coord = yi - (int)PAD_TOP + 1 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with zeros
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 1 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // yi == 2
+ // Clamp yi_coord
+ yi_coord = yi - (int)PAD_TOP + 2 * DILATION_Y;
+
+ // Clamp only if PAD_TOP or PAD_BOTTOM is not equal to 0
+#if PAD_TOP != 0 || PAD_BOTTOM != 0
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1));
+#endif // PAD_TOP != 0 || PAD_BOTTOM != 0
+
+ // Compute offset
+ offset = xi_offset + (yi_coord * (int)src_stride_z);
+
+ // Load input values
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s0));
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s1));
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset.s2));
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+ // Replace invalid values with PAD_VALUE
+ y_cond = (int)((uint)(yi - (int)PAD_TOP + 2 * DILATION_Y) >= (uint)(SRC_HEIGHT));
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s0)));
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s1)));
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond.s2)));
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+ // Store in a boundary-aware way to avoid padding
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, 0, output_ptr)
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 9) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+
+#if PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ int y_cond = (int)((uint)(yi - (int)PAD_TOP + i * DILATION_Y) >= (uint)(SRC_HEIGHT)); \
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s0))); \
+ values1 = select(values1, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s1))); \
+ values2 = select(values2, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s2))); \
+ values3 = select(values3, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s3))); \
+ values4 = select(values4, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s4))); \
+ values5 = select(values5, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s5))); \
+ values6 = select(values6, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s6))); \
+ values7 = select(values7, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond0.s7))); \
+ values8 = select(values8, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)y_cond || (COND_N)(x_cond1))); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#else // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+#define IM2COL1x9(i) \
+ ({ \
+ yi_coord = yi - (int)PAD_TOP + i * DILATION_Y; \
+ yi_coord = min((uint)yi_coord, (uint)(SRC_HEIGHT - 1)); \
+ \
+ offset0 = xi_offset0 + (yi_coord * (int)src_stride_z); \
+ offset1 = xi_offset1 + (yi_coord * (int)src_stride_z); \
+ \
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s0)); \
+ VECTOR_N values1 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s1)); \
+ VECTOR_N values2 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s2)); \
+ VECTOR_N values3 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s3)); \
+ VECTOR_N values4 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s4)); \
+ VECTOR_N values5 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s5)); \
+ VECTOR_N values6 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s6)); \
+ VECTOR_N values7 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset0.s7)); \
+ VECTOR_N values8 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset1)); \
+ \
+ IM2COL1X9_NHWC_STORE(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE, DATA_TYPE, SRC_DEPTH, values, i, output_ptr) \
+ })
+#endif // PAD_TOP != 0 || PAD_LEFT != 0 || PAD_BOTTOM != 0 || PAD_RIGHT != 0
+
+/** This kernel performs im2col when the kernel size is 9x9 and the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel depth must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=3
+ * @note The stride along the Y direction must be passed at compile time using -DSTRIDE_Y: e.g. -DSTRIDE_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col9x9_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int yi_coord = 0;
+ int8 offset0 = 0;
+ int offset1 = 0;
+
+ // Clamp xi
+ int8 xi_offset0 = ((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT);
+ int xi_offset1 = ((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT);
+
+#if PAD_LEFT != 0 || PAD_RIGHT != 0
+#define CLAMP(x, min_val, max_val) min(max(x, min_val), max_val)
+ xi_offset0 = CLAMP(xi_offset0, (int8)0, (int8)(SRC_WIDTH - 1));
+ xi_offset1 = CLAMP(xi_offset1, (int)0, (int)(SRC_WIDTH - 1));
+#endif // PAD_LEFT != 0 || PAD_RIGHT != 0
+ xi_offset0 *= (int8)src_stride_y;
+ xi_offset1 *= (int)src_stride_y;
+
+ // Out-of-bound condition for X
+ int8 x_cond0 = (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) < (int8)0) || (((int8)xi + (int8)(0, 1, 2, 3, 4, 5, 6, 7) * DILATION_X - (int8)PAD_LEFT) >= (int8)SRC_WIDTH);
+ int x_cond1 = (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) < (int)0) || (((int)xi + (int)(8) * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ IM2COL1x9(0);
+ IM2COL1x9(1);
+ IM2COL1x9(2);
+ IM2COL1x9(3);
+ IM2COL1x9(4);
+ IM2COL1x9(5);
+ IM2COL1x9(6);
+ IM2COL1x9(7);
+ IM2COL1x9(8);
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * 81) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+
+/** This opencl kernel performs a generic im2col implementation when the data layout is NHWC
+ *
+ * @note This kernel computes VECTOR_SIZE elements
+ * @note This kernel stores VECTOR_SIZE or BOUNDARY_VECTOR_SIZE (if at boundary) elements
+ * @note The vector size must be passed at compile time using -DVECTOR_SIZE: e.g. -DVECTOR_SIZE=2
+ * @note The boundary vector size must be passed at compile time using -DBOUNDARY_VECTOR_SIZE: e.g. -DBOUNDARY_VECTOR_SIZE=1
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The width and height of the input tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT: e.g. -DSRC_WIDTH=128 and -DSRC_HEIGHT=128
+ * @note The width of output tensor after matrix multiplication must be passed at compile time using -DCONVOLVED_WIDTH: e.g. -DCONVOLVED_WIDTH=34
+ * @note The kernel width, height and depth must be passed at compile time using -DKERNEL_WIDTH, -DKERNEL_HEIGHT and -DSRC_DEPTH: e.g. -DKERNEL_WIDTH=3, -DKERNEL_HEIGHT=3 and -DSRC_DEPTH=64
+ * @note The pad_left, pad_right, pad_top and pad_bottom must be passed at compile time using -DPAD_LEFT, -DPAD_RIGHT, -DPAD_TOP and -DPAD_BOTTOM: e.g. -DPAD_LEFT=1, -DPAD_RIGHT=2, -DPAD_TOP=3 and -DPAD_BOTTOM=2
+ * @note The zero value to store in case we load values out-of-bounds must be passed at compile time using -DPAD_VALUE: e.g. -DPAD_VALUE=0.0
+ * @note The stride along the X and Y directions must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y: e.g. -DSTRIDE_X=1 and -DSTRIDE_Y=1
+ * @note The dilation_x and dilation_y must be passed at compile time using -DDILATION_X and -DDILATION_Y: e.g. -DDILATION_X=1, -DDILATION_Y=1
+ * @note In case biases will be added to the convolution -DHAS_BIAS has to be passed to append the final matrix with 1 in each row.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8_SIGNED/QASYMM8/F16/F32
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes).
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes).
+ */
+__kernel void im2col_generic_nhwc(
+ TENSOR3D_DECLARATION(src),
+ IMAGE_DECLARATION(dst),
+ uint src_stride_w,
+ uint dst_stride_w)
+{
+ // input feature map, boundary-corrected (shift all non-boundary vectors by shift_amount) to avoid padding
+ const int shift_amount = (int)VECTOR_SIZE - (int)BOUNDARY_VECTOR_SIZE;
+ const int ch = max((int)(get_global_id(0) * VECTOR_SIZE) - shift_amount, 0);
+ const int yo = get_global_id(1);
+ const int batch = get_global_id(2); // batch size
+
+ // Calculate input indices
+ const int xi = (get_global_id(1) % CONVOLVED_WIDTH) * STRIDE_X;
+ const int yi = (get_global_id(1) / (int)CONVOLVED_WIDTH) * STRIDE_Y;
+
+ // Get input and output address
+ __global uchar *input_ptr = src_ptr + src_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + batch * (int)src_stride_w;
+ __global uchar *output_ptr = dst_ptr + dst_offset_first_element_in_bytes + ch * sizeof(DATA_TYPE) + yo * (int)dst_stride_y + batch * (int)dst_stride_w;
+
+ int i = 0;
+ for(int yk = 0; yk < KERNEL_HEIGHT; ++yk)
+ {
+ // Clamp yi_coord
+ int yi_coord = yi + yk * DILATION_Y - (int)PAD_TOP;
+ yi_coord = CLAMP(yi_coord, (int)0, (int)(SRC_HEIGHT - 1));
+
+ // Out-of-bound condition for Y
+ int y_border_condition = ((yi + yk * DILATION_Y - (int)PAD_TOP) < (int)0) || ((yi + yk * DILATION_Y - (int)PAD_TOP) >= (int)SRC_HEIGHT);
+
+ for(int xk = 0; xk < KERNEL_WIDTH; ++xk)
+ {
+ // Clamp xi_coord
+ int xi_coord = (xi + xk * DILATION_X - (int)PAD_LEFT);
+ xi_coord = CLAMP(xi_coord, (int)0, (int)(SRC_WIDTH - 1));
+
+ // Out-of-bound condition for X
+ int x_border_condition = ((xi + xk * DILATION_X - (int)PAD_LEFT) < (int)0) || ((xi + xk * DILATION_X - (int)PAD_LEFT) >= (int)SRC_WIDTH);
+
+ int offset = xi_coord * (int)src_stride_y + (yi_coord * (int)src_stride_z);
+
+ VECTOR_N values0 = VLOAD(VECTOR_SIZE)(0, (__global DATA_TYPE *)(input_ptr + offset));
+
+#if PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+ // Replace with PAD_VALUE if the value is out-of-bound
+ values0 = select(values0, (VECTOR_N)PAD_VALUE, (COND_N)((COND_N)x_border_condition || (COND_N)(y_border_condition)));
+#endif // PAD_LEFT != 0 || PAD_TOP != 0 || PAD_RIGHT != 0 || PAD_BOTTOM != 0
+
+ // Store in a boundary-aware way to avoid padding
+#if BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ const bool at_channel_boundary = get_global_id(0) == 0;
+ if(at_channel_boundary)
+ {
+ VSTORE_PARTIAL(VECTOR_SIZE, BOUNDARY_VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ else // at_channel_boundary
+#endif // BOUNDARY_VECTOR_SIZE != VECTOR_SIZE
+ {
+ VSTORE(VECTOR_SIZE)
+ (values0, 0, (__global DATA_TYPE *)(output_ptr) + i * (int)SRC_DEPTH);
+ }
+ i++;
+ }
+ }
+
+#ifdef HAS_BIAS
+ // We can use VECTOR_SIZE instead of BOUNDARY_VECTOR_SIZE even if it's at the boundary. This is because the bias is
+ // added at the end of the channel, while the boundary vec is at the beginning of the channel.
+ // The only case where the boundary vec is at the end of the channel is when there's only a single boundary vec in
+ // the whole channel dimension, but in that case VECTOR_SIZE is also equal to BOUNDARY_VECTOR_SIZE
+ // See the value of num_elems_processed_per_iteration in configure_opencl_kernel method in CLIm2ColKernel.cpp
+ if((ch + VECTOR_SIZE) >= SRC_DEPTH)
+ {
+ *((__global DATA_TYPE *)(output_ptr) - ch + SRC_DEPTH * KERNEL_WIDTH * KERNEL_HEIGHT) = 1.0f;
+ }
+#endif // HAS_BIAS
+}
+#endif // defined(CONVOLVED_WIDTH) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(STRIDE_X) && defined(STRIDE_Y) && defined(KERNEL_WIDTH) && defined(KERNEL_HEIGHT) && defined(SRC_DEPTH) && defined(PAD_LEFT) && defined(PAD_RIGHT) && defined(PAD_TOP) && defined(PAD_BOTTOM) && defined(PAD_VALUE) && defined(VECTOR_SIZE) && defined(BOUNDARY_VECTOR_SIZE)
+#endif // defined(DATA_TYPE) && defined(ELEMENT_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/normalization_layer.cl b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
index 4569208824..7e35e161c8 100644
--- a/src/core/CL/cl_kernels/normalization_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/normalization_layer.cl
@@ -30,69 +30,6 @@
#define POW_OP(x, y) pow((x), (y))
#define SQCVT_SAT(a) (a)
-#if defined(NUM_SLICES)
-/** Apply cross-map normalization.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note The number of slices should be given as a preprocessor argument using -DNUM_SLICES=size. e.g. -DNUM_SLICES=192
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_cross_map_nchw(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = (VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE))SQCVT_SAT(KAPPA);
-
- const int current_slice = get_global_id(2);
- const int left_slice = max(-(int)RADIUS, -current_slice);
- const int right_slice = min((int)RADIUS, (int)NUM_SLICES - 1 - current_slice);
-
- for(int i = left_slice; i <= right_slice; i++)
- {
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, 0, 0, i));
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
-
- VSTORE(VEC_SIZE)
- (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
-#endif /* defined(NUM_SLICES) */
-
#if defined(WIDTH_SIZE)
/** Apply cross-map normalization.
*
@@ -156,85 +93,6 @@ __kernel void normalization_layer_cross_map_nhwc(TENSOR3D_DECLARATION(input),
STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-
-/** Apply in-map normalization when tensors are in the NCHW data layout format.
- *
- * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=short
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size, e.g. -DVEC_SIZE=16
- * @note The radius should be given as a preprocessor argument using -DRADIUS=size. e.g. -DRADIUS=5
- * @note Scaling coefficient (= alpha/norm_size), beta and kappa need to be passed at compile time using -DCOEFF, -DALPHA and -DKAPPA
- * @note The leftover size in the X dimension shoud be given as preprocessor argument using -DVEC_SIZE_LEFTOVER is; x_dimension % VEC_SIZE. e.g. -DVEC_SIZE_LEFTOVER=1
- *
- * @param[in] input_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the first destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void normalization_layer_in_map_nchw(TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- acc = 0;
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- coeff_v = SQCVT_SAT(COEFF);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- beta_v = SQCVT_SAT(BETA);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- kappa_v = SQCVT_SAT(KAPPA);
-
- const int current_col = get_global_id(0) << 2;
- const int left_pos = max(-(int)RADIUS, -3 - current_col);
- const int right_pos = min((int)RADIUS, (int)WIDTH_SIZE - 1 - current_col);
-
-#if defined(IN_MAP_2D)
- const int current_row = get_global_id(1);
- const int first_row = max(-(int)RADIUS, -current_row);
- const int last_row = min((int)RADIUS, (int)get_global_size(1) - 1 - current_row);
-#endif /* defined(IN_MAP_2D) */
-
-#if defined(IN_MAP_2D)
- for(int j = first_row; j <= last_row; ++j)
- {
-#endif /* defined(IN_MAP_2D) */
- for(int i = left_pos; i <= right_pos; ++i)
- {
-#if defined(IN_MAP_2D)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, j, 0));
-#else /* defined(IN_MAP_2D) */
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&in, i, 0, 0));
-#endif /* defined(IN_MAP_2D) */
- acc = ADD_OP(acc, MUL_OP(values, values));
- }
-#if defined(IN_MAP_2D)
- }
-#endif /* defined(IN_MAP_2D) */
-
- acc = ADD_OP(MUL_OP(acc, coeff_v), kappa_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized = POW_OP(acc, beta_v);
- const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr), normalized);
-
- VSTORE(VEC_SIZE)
- (normalized_pixel, 0, (__global DATA_TYPE *)out.ptr);
-}
#endif // defined(WIDTH_SIZE)
#if defined(NUM_SLICES) && defined(DIM1_SIZE)
@@ -267,9 +125,9 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
TENSOR3D_DECLARATION(output))
{
// Offset computation
- const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
- const int current_cols = get_global_id(1);
- const int current_rows = get_global_id(2);
+ const uint x_offs = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ const int current_cols = get_global_id(1);
+ const int current_rows = get_global_id(2);
// Address computation
__global uchar *input_addr = input_ptr + input_offset_first_element_in_bytes + x_offs * sizeof(DATA_TYPE);
@@ -284,8 +142,8 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
kappa_v = SQCVT_SAT(KAPPA);
- const int first_col = max(0, current_cols - (int)RADIUS);
- const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
+ const int first_col = max(0, current_cols - (int)RADIUS);
+ const int last_col = min((int)DIM1_SIZE - 1, current_cols + (int)RADIUS);
#if defined(IN_MAP_2D)
const int first_row = max(0, current_rows - (int)RADIUS);
@@ -312,7 +170,7 @@ __kernel void normalization_layer_in_map_nhwc(TENSOR3D_DECLARATION(input),
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
normalized = POW_OP(acc, beta_v);
const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows * output_stride_z)), normalized);
+ normalized_pixel0 = DIV_OP(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(input_addr + current_cols * output_stride_y + current_rows *output_stride_z)), normalized);
STORE_VECTOR_SELECT(normalized_pixel, DATA_TYPE, output_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
index 0a098356b4..86c33499e2 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer.cl
@@ -27,59 +27,6 @@
#define TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
-/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: F16/F32
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
- */
-__kernel void normalize_planar_yuv_layer_nchw(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector std = CONVERT_TO_VECTOR_STRUCT(std);
-
- const uint current_slice = get_global_id(2) % NUM_CHANNELS;
-
- const DATA_TYPE curr_mean = *((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE)));
- const DATA_TYPE curr_std = *((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE)));
-
- TYPE data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr);
- TYPE res = (data - curr_mean) / curr_std;
-
- VSTORE(VEC_SIZE)
- (res, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
*
* @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -131,4 +78,4 @@ __kernel void normalize_planar_yuv_layer_nhwc(TENSOR3D_DECLARATION(src),
STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
index d660fffb58..7bc3c15a63 100644
--- a/src/core/CL/cl_kernels/normalize_planar_yuv_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.cl
@@ -29,76 +29,6 @@
#define OFFSET_FLT ((float)OFFSET)
#define SCALE_FLT ((float)SCALE)
-#if defined(NUM_CHANNELS)
-
-/** Apply normalize_planar_yuv layer on tensors with NCHW data layout.
- *
- * @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
- * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE e.g. -DVEC_SIZE=8
- * @note The depth of the input tensor should be given as a preprocessor argument using -DNUM_CHANNELS e.g. -DNUM_CHANNELS=8
- * @note The quantization offset should be given as a preprocessor argument using -DOFFSET e.g. -DOFFSET=8
- * @note The quantization scale should be given as a preprocessor argument using -DSCALE e.g. -DSCALE=8
- *
- * @param[in] src_ptr Pointer to the first source tensor. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] src_stride_x Stride of the first source tensor in X dimension (in bytes)
- * @param[in] src_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the first source tensor in Y dimension (in bytes)
- * @param[in] src_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the first source tensor in Z dimension (in bytes)
- * @param[in] src_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the first source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] dst_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] mean_ptr Pointer to the mean source tensor. Supported data types: same as @p src_ptr
- * @param[in] mean_stride_x Stride of the mean source tensor in X dimension (in bytes)
- * @param[in] mean_step_x mean_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] mean_offset_first_element_in_bytes The offset of the first element in the mean source tensor
- * @param[in] std_ptr Pointer to the std tensor. Supported data types: same as @p src_ptr
- * @param[in] std_stride_x Stride of the std tensor in X dimension (in bytes)
- * @param[in] std_step_x std_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] std_offset_first_element_in_bytes The offset of the first element in the var source tensor
- */
-__kernel void normalize_planar_yuv_layer_q8_nchw(TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst),
- VECTOR_DECLARATION(mean),
- VECTOR_DECLARATION(std))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
- Vector mean = CONVERT_TO_VECTOR_STRUCT(mean);
- Vector std = CONVERT_TO_VECTOR_STRUCT(std);
-
- const uint current_slice = get_global_id(2) % NUM_CHANNELS;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- curr_mean_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(mean.ptr + current_slice * sizeof(DATA_TYPE))));
- curr_mean_flt = round(curr_mean_flt - OFFSET_FLT) * SCALE_FLT;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- curr_std_flt = (VEC_DATA_TYPE(float, VEC_SIZE))(*((__global DATA_TYPE *)(std.ptr + current_slice * sizeof(DATA_TYPE))));
- curr_std_flt = round(curr_std_flt - OFFSET_FLT) * SCALE_FLT;
-
- VEC_DATA_TYPE(float, VEC_SIZE)
- data_flt = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr), VEC_DATA_TYPE(float, VEC_SIZE));
- data_flt = round(data_flt - OFFSET_FLT) * SCALE_FLT;
-
- // Perform normalization
- VEC_DATA_TYPE(float, VEC_SIZE)
- res_flt = (data_flt - curr_mean_flt) / curr_std_flt;
-
- const TYPE res_u8 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
- VSTORE(VEC_SIZE)
- (res_u8, 0, (__global DATA_TYPE *)dst.ptr);
-}
-
-#endif // defined(NUM_CHANNELS)
-
/** Apply normalize_planar_yuv layer on tensors with NHWC data layout.
*
* @note Data type should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
@@ -163,4 +93,4 @@ __kernel void normalize_planar_yuv_layer_q8_nhwc(TENSOR3D_DECLARATION(src),
const TYPE res0 = CONVERT_SAT(round(res_flt / SCALE_FLT) + OFFSET_FLT, TYPE);
STORE_VECTOR_SELECT(res, DATA_TYPE, dst_addr, VEC_SIZE, VEC_SIZE_LEFTOVER, VEC_SIZE_LEFTOVER != 0 && get_global_id(0) == 0);
}
-#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE)
+#endif // defined(DATA_TYPE) && defined(VEC_SIZE) && defined(OFFSET) && defined(SCALE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/pooling_layer.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
new file mode 100644
index 0000000000..5b59ff5088
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer.cl
@@ -0,0 +1,364 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "repeat.h"
+#include "tile_helpers.h"
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#define POOL_OP(x, y) ((x) + (y))
+#else /* defined(POOL_AVG) || defined(POOL_L2) */
+#define POOL_OP(x, y) (fmax((x), (y)))
+#endif /* defined(POOL_AVG) || defined(POOL_L2) */
+
+#if defined(POOL_L2)
+#define POW2_OP(x, vec_size) ((x) * (x))
+#else /* defined(POOL_L2) */
+#define POW2_OP(x, vec_size) (x)
+#endif /* defined(POOL_L2) */
+
+#define DIV_OP(x, y) (x * (1.f / y))
+#define SQRT_OP(x) sqrt((x))
+
+#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
+
+#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void pooling_layer_MxN_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output))
+{
+ // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
+ int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
+ int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
+#else //DST_BATCH_SIZE != 1
+ int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
+ int idx_out_n = 0;
+#endif // DST_BATCH_SIZE != 1
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = INITIAL_VALUE;
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
+
+#if defined(EXCLUDE_PADDING)
+ int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
+#else // defined(EXCLUDE_PADDING)
+ int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
+#endif // defined(EXCLUDE_PADDING)
+
+#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ // Global pooling path
+ for(int y = 0; y < POOL_SIZE_Y; ++y)
+ {
+#pragma unroll 8
+ for(int x = 0; x < POOL_SIZE_X; ++x)
+ {
+#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ for(int y = pool_y_s; y < pool_y_e; ++y)
+ {
+#pragma unroll 8
+ for(int x = pool_x_s; x < pool_x_e; ++x)
+ {
+#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ data0;
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+#endif // defined(POOL_L2)
+ res0 = POOL_OP(res0, data0);
+ }
+ }
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+}
+#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
+
+#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+
+/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
+ * -# max, -DPOOL_MAX must be passed at compile time
+ * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
+ * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
+ * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
+ *
+ * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
+ * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
+ * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
+ * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
+ * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
+ * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
+ * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
+ * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
+ * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
+ * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
+ * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
+ * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
+ * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
+ * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
+ * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
+ */
+__kernel void pooling_layer_2x2_nhwc(
+ TENSOR4D_DECLARATION(input),
+ TENSOR4D_DECLARATION(output)
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+ ,
+ TENSOR4D_DECLARATION(indices)
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+)
+{
+ // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
+ // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
+ int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
+ int idx_out_w = get_global_id(1);
+#if DST_BATCH_SIZE != 1
+ // If batch size != 1, the batch size dimension is collapsed over the height dimension
+ int idx_out_h = get_global_id(2) % DST_HEIGHT;
+ int idx_out_n = get_global_id(2) / DST_HEIGHT;
+#else //SRC_BATCH_SIZE != 1
+ int idx_out_h = get_global_id(2);
+ int idx_out_n = 0;
+#endif // SRC_BATCH_SIZE != 1
+
+ int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
+ int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
+
+ __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
+
+ __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
+ output_stride_w;
+
+ int pool_x_s = max((int)0, -idx_in_w);
+ int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
+ int pool_y_s = max((int)0, -idx_in_h);
+ int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
+
+ int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
+
+ int x0 = pool_x_s + idx_in_w;
+ int y0 = pool_y_s + idx_in_h;
+ int x1 = pool_x_e - 1 + idx_in_w;
+ int y1 = pool_y_e - 1 + idx_in_h;
+
+ REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
+
+#if defined(FP_MIXED_PRECISION)
+ // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
+ data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+ data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
+#else // defined(FP_MIXED_PRECISION)
+ data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
+ data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
+ data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
+ data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
+#endif // defined(FP_MIXED_PRECISION)
+
+#if !defined(POOL_MAX)
+ if(filter_size != 4)
+ {
+ SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
+ SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
+ SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
+ SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
+
+ // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
+ data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
+ data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
+ data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
+ data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
+ }
+#endif // !defined(POOL_MAX)
+
+#if defined(POOL_L2)
+ // Raise to power of 2 for L2 Pooling
+ data0 *= data0;
+ data1 *= data1;
+ data2 *= data2;
+ data3 *= data3;
+#endif /* defined(POOL_L2) */
+
+ VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
+ res0 = data0;
+ res0 = POOL_OP(res0, data1);
+ res0 = POOL_OP(res0, data2);
+ res0 = POOL_OP(res0, data3);
+
+#if defined(POOL_AVG) || defined(POOL_L2)
+#if defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
+#else // !defined(EXCLUDE_PADDING)
+ res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
+#endif // defined(EXCLUDE_PADDING)
+#endif // defined(POOL_AVG) || defined(POOL_L2)
+
+#if defined(POOL_L2)
+ // Take square root of the result in L2 pooling
+ res0 = SQRT_OP(res0);
+#endif // defined(POOL_L2)
+
+ // Store result
+#if defined(FP_MIXED_PRECISION)
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
+ STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#else // defined(FP_MIXED_PRECISION)
+ STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
+#endif // defined(FP_MIXED_PRECISION)
+
+#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+
+ // This part is used to return the index of the maximum value
+ // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor
+
+ // note: Batch dimension does not contribute in the offset contribution
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ base_index = (uint)idx_out_c;
+
+ base_index += VEC_OFFS(uint, VEC_SIZE);
+
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+ VEC_DATA_TYPE(uint, VEC_SIZE)
+ index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
+
+ index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
+ index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
+
+ __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
+ indices_stride_w;
+
+ // Store result
+ STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
+#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
+}
+#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer_quantized.cl b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
index d8cef2b4e6..46268a4a88 100644
--- a/src/core/CL/cl_kernels/pooling_layer_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/pooling_layer_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017-2020 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -51,108 +51,6 @@
#error "L2 pooling is not supported"
#endif /* defined(POOL_L2) */
-int calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return ((end_y - start_y) * (end_x - start_x));
-}
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note Input data type must be passed at compile time using -DDAT_TYPE=type, e.g. -DDATA_TYPE=uchar
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED
- * @param[in] input_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] output_ptr Pointer to the destination image. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void pooling_layer_MxN_quantized_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- int8 vdata = INITIAL_VALUE;
- int sdata = INITIAL_VALUE;
-
- // Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
- {
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
- {
- VEC_TYPE(8)
- data = vload8(0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- int8 data0 = convert_int8(data);
- vdata = POOL_OP(vdata, data0);
- }
-
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- DATA_TYPE data = *((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
- int data0 = convert_int(data);
- sdata = POOL_OP(sdata, data0);
- }
- }
-
- // Reduce result
- int4 reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
- int2 reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- int res = POOL_OP(reduce2.s0, reduce2.s1);
- res = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG)
- res = round(DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y)));
-#endif /* defined(POOL_AVG) */
-
- DATA_TYPE result_q8 = CONVERT(res, DATA_TYPE);
-
-#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT)
-
- const float result_f32 = convert_float(result_q8);
- const float input_offset = (float)OFFSET_IN1;
- const float input_scale = (float)SCALE_IN1;
- const float scale_out = (float)SCALE_OUT;
- const float offset_out = (float)OFFSET_OUT;
- const float in_f32 = (result_f32 - input_offset) * input_scale;
- const float out_f32 = in_f32 / scale_out + offset_out;
- result_q8 = CONVERT_SAT(convert_int_rte(out_f32), DATA_TYPE);
-
-#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */
-
- *(__global DATA_TYPE *)output.ptr = result_q8;
-}
-
#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
* -# max, -DPOOL_MAX must be passed at compile time
diff --git a/src/core/CL/cl_kernels/remap.cl b/src/core/CL/cl_kernels/nhwc/remap.cl
index cb67c2df1e..0b629fe6c9 100644
--- a/src/core/CL/cl_kernels/remap.cl
+++ b/src/core/CL/cl_kernels/nhwc/remap.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2017, 2021 Arm Limited.
+ * Copyright (c) 2017-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,113 +24,7 @@
#include "helpers.h"
#include "warp_helpers.h"
-#ifndef DEPTH_OUT
-/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_nearest_neighbour_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(read_texels4(&in, convert_int8(clamp_to_border(map_coords, width, height))), 0, out.ptr);
-}
-
-/** Performs a remapping of an input image to an output given two remapping image using bilinear as interpolation.
- *
- * This kernel performs remapping with this method of pixel coordinate translation:
- * out(x,y) = in(mapx(x,y), mapy(x,y));
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x in_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y in_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8.
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x out_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y out_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
- * @param[in] mapx_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapx_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapx_step_x mapx_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapx_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapx_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapx_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] mapy_ptr Pointer to the x remapping image. Supported data types: F32.
- * @param[in] mapy_stride_x Stride of the remapping image in X dimension (in bytes)
- * @param[in] mapy_step_x mapy_stride_x * number of elements along X processed per work item (in bytes)
- * @param[in] mapy_stride_y Stride of the remapping image in Y dimension (in bytes)
- * @param[in] mapy_step_y mapy_stride_y * number of elements along Y processed per work item (in bytes)
- * @param[in] mapy_offset_first_element_in_bytes Offset of the first element in the remapping image
- * @param[in] width Width of the input image
- * @param[in] height Height of the input image
- */
-__kernel void remap_bilinear_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- IMAGE_DECLARATION(mapx),
- IMAGE_DECLARATION(mapy),
- const float width,
- const float height)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- Image mapx = CONVERT_TO_IMAGE_STRUCT(mapx);
- Image mapy = CONVERT_TO_IMAGE_STRUCT(mapy);
-
- float4 mapx_coords = vload4(0, (__global float *)mapx.ptr);
- float4 mapy_coords = vload4(0, (__global float *)mapy.ptr);
- float8 map_coords = (float8)(mapx_coords.s0, mapy_coords.s0, mapx_coords.s1, mapy_coords.s1,
- mapx_coords.s2, mapy_coords.s2, mapx_coords.s3, mapy_coords.s3);
-
- vstore4(bilinear_interpolate(&in, clamp_to_border(map_coords, width, height), width, height), 0, out.ptr);
-}
-#else // DEPTH_OUT
+#ifdef DEPTH_OUT
/** Performs a remapping of an input image to an output given two remapping image using nearest neighbor as interpolation.
* Also applies constant border value, "border_val", if "CONSTANT_BORDER" is set.
*
@@ -283,4 +177,4 @@ __kernel void remap_bilinear_nhwc(
*((__global DATA_TYPE *)out.ptr) = CONVERT(fr, DATA_TYPE);
}
-#endif // DEPTH_OUT
+#endif // DEPTH_OUT \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/reorg_layer.cl b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
new file mode 100644
index 0000000000..a340b0b8a2
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/reorg_layer.cl
@@ -0,0 +1,76 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE)
+
+#define CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi) \
+ ({ \
+ int offset = zo / (int)SRC_DEPTH; \
+ xi = xo * (int)STRIDE + offset % (int)STRIDE; \
+ yi = yo * (int)STRIDE + offset / (int)STRIDE; \
+ zi = zo % SRC_DEPTH; \
+ })
+
+/** Performs a reorganization layer of input tensor to the output tensor when the data layout is NHWC
+ *
+ * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float
+ * @note The depth of the input tensor must be passed at compile time using -DSRC_DEPTH: e.g. -DSRC_DEPTH=64
+ * @note The distance between 2 consecutive pixels along the x and y direction must be passed at compile time using -DSTRIDE: e.g. -DSTRIDE=2
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void reorg_layer_nhwc(
+ TENSOR3D_DECLARATION(src),
+ TENSOR3D_DECLARATION(dst))
+{
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(dst);
+
+ int xo = get_global_id(1);
+ int yo = get_global_id(2);
+ int zo = get_global_id(0);
+ int xi, yi, zi;
+
+ CALCULATE_SRC_COORDINATES(xo, yo, zo, xi, yi, zi);
+
+ int src_offset = zi * sizeof(DATA_TYPE) + xi * src_stride_y + yi * src_stride_z;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)(src_ptr + src_offset_first_element_in_bytes + src_offset));
+}
+#endif // // defined(DATA_TYPE) && defined(SRC_DEPTH) && defined(STRIDE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/scale.cl b/src/core/CL/cl_kernels/nhwc/scale.cl
index d4c27e6cf6..1ea5e73df1 100644
--- a/src/core/CL/cl_kernels/scale.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2016-2020 Arm Limited.
+ * Copyright (c) 2016-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,129 +24,6 @@
#include "helpers.h"
#include "warp_helpers.h"
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_nearest(const float2 coord, const float2 scale)
-{
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel U8 or S16.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_nearest_neighbour_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- float8 transformed = transform_nearest(get_current_coords(), r);
-#ifdef ALIGN_CORNERS
- transformed = round(transformed);
-#endif // ALIGN_CORNERS
- const float8 tc = clamp_to_border_with_size(transformed, input_width, input_height, BORDER_SIZE);
- vstore4(read_texels4(&in, convert_int8(tc)), 0, (__global DATA_TYPE *)out.ptr);
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: U8, S16.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear(get_current_coords(), r);
- vstore4(bilinear_interpolate_with_border(&in, tc, input_width, input_height, BORDER_SIZE), 0, (__global DATA_TYPE *)out.ptr);
-}
-
#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the NEAREAST NEIGHBOUR method. Input and output are single channel F32. (NHWC)
*
@@ -262,22 +139,22 @@ __kernel void scale_bilinear_nhwc(
const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < input_width);
+ const bool check_x = (0.f <= new_xf && new_xf < input_width);
const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
- const bool check_y = (0.f <= new_yf && new_yf < input_height);
+ const bool check_y = (0.f <= new_yf && new_yf < input_height);
const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
- const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y);
+ const float ins_0 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+ (get_global_id(2) / DEPTH_OUT)))),
+ check_x && check_y);
const float ins_1 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
(get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
+ check_x1 && check_y);
const float ins_2 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
(get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
+ check_x && check_y1);
const float ins_3 = select((float)(CONSTANT_VALUE), (float)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
(get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
+ check_x1 && check_y1);
float4 ins = (float4)(ins_0, ins_1, ins_2, ins_3);
#else /* BORDER_MODE_REPLICATE */
float4 ins = (float4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
diff --git a/src/core/CL/cl_kernels/scale_quantized.cl b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
index 010e4ed57a..de9bb607b0 100644
--- a/src/core/CL/cl_kernels/scale_quantized.cl
+++ b/src/core/CL/cl_kernels/nhwc/scale_quantized.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -24,67 +24,6 @@
#include "helpers_asymm.h"
#include "warp_helpers_quantized.h"
-/** Transforms four 2D coordinates. This is used to map the output coordinates to the input coordinates.
- *
- * @param[in] coord 2D coordinates to transform.
- * @param[in] scale input/output scale ratio
- *
- * @return a float8 containing 4 2D transformed values in the input image.
- */
-inline const float8 transform_bilinear_quantized(const float2 coord, const float2 scale)
-{
- const float4 in_x_coords = (float4)(coord.s0, 1 + coord.s0, 2 + coord.s0, 3 + coord.s0);
-#ifdef SAMPLING_POLICY_TOP_LEFT
- const float4 new_x = in_x_coords * (float4)(scale.s0);
- const float4 new_y = (float4)(coord.s1 * scale.s1);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#elif SAMPLING_POLICY_CENTER
- const float4 new_x = (in_x_coords + ((float4)(0.5f))) * (float4)(scale.s0) - (float4)(0.5f);
- const float4 new_y = (float4)((coord.s1 + 0.5f) * scale.s1 - 0.5f);
- return (float8)(new_x.s0, new_y.s0, new_x.s1, new_y.s1, new_x.s2, new_y.s2, new_x.s3, new_y.s3);
-#else /* SAMPLING_POLICY */
-#error("Unsupported sampling policy");
-#endif /* SAMPLING_POLICY */
-}
-
-/** Performs an affine transformation on an image interpolating with the BILINEAR method.
- *
- * @note Sampling policy to used is passed as -DSAMPLING_POLICY_(TYPE) e.g. -DSAMPLING_POLICY_TOP_LEFT
- * @note Scale value for QASYMM8 data type to used is passed as -DSCALE=<VALUE> e.g. -DSCALE=0.5
- * @note Offset value for QASYMM8 data type to used is passed as -DOFFSET=<VALUE> e.g. -DOFFSET=1
- *
- * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8.
- * @param[in] in_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] in_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] in_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] in_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] in_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16. (Must be the same as the input)
- * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] out_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] out_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] input_width Input image width
- * @param[in] input_height Input image height
- * @param[in] scale_x The scale factor along x dimension
- * @param[in] scale_y The scale factor along y dimension
- */
-__kernel void scale_bilinear_quantized_nchw(
- IMAGE_DECLARATION(in),
- IMAGE_DECLARATION(out),
- const float input_width,
- const float input_height,
- const float scale_x,
- const float scale_y)
-{
- Image in = CONVERT_TO_IMAGE_STRUCT_NO_STEP(in);
- Image out = CONVERT_TO_IMAGE_STRUCT(out);
- const float2 r = (float2)(scale_x, scale_y);
- const float8 tc = transform_bilinear_quantized(get_current_coords_quantized(), r);
- vstore4(bilinear_interpolate_with_border_quantized(&in, tc, input_width, input_height, BORDER_SIZE, SCALE, OFFSET), 0, (__global DATA_TYPE *)out.ptr);
-}
-
#if defined(DEPTH_OUT)
/** Performs scale on an image interpolating with the BILINEAR method. (NHWC)
*
@@ -146,22 +85,22 @@ __kernel void scale_bilinear_quantized_nhwc(
const float clamped_y1 = clamp(new_yf + 1, 0.0f, input_height - 1);
#ifndef BORDER_MODE_REPLICATE
- const bool check_x = (0.f <= new_xf && new_xf < input_width);
- const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
- const bool check_y = (0.f <= new_yf && new_yf < input_height);
- const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
- const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
- (get_global_id(2) / DEPTH_OUT)))),
+ const bool check_x = (0.f <= new_xf && new_xf < input_width);
+ const bool check_x1 = (-1.f <= new_xf && new_xf < input_width - 1);
+ const bool check_y = (0.f <= new_yf && new_yf < input_height);
+ const bool check_y1 = (-1.f <= new_yf && new_yf < input_height - 1);
+ const int ins_0 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y),
+ (get_global_id(2) / DEPTH_OUT)))),
check_x && check_y);
const int ins_1 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y),
(get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y);
+ check_x1 && check_y);
const int ins_2 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y1),
(get_global_id(2) / DEPTH_OUT)))),
- check_x && check_y1);
+ check_x && check_y1);
const int ins_3 = select((int)(CONSTANT_VALUE), (int)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x1), convert_int(clamped_y1),
(get_global_id(2) / DEPTH_OUT)))),
- check_x1 && check_y1);
+ check_x1 && check_y1);
int4 ins = (int4)(ins_0, ins_1, ins_2, ins_3);
#else /* BORDER_MODE_REPLICATE */
int4 ins = (int4)(*((__global DATA_TYPE *)tensor4D_offset(&in, get_global_id(0), convert_int(clamped_x), convert_int(clamped_y), (get_global_id(2) / DEPTH_OUT))),
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_batch.cl b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
new file mode 100644
index 0000000000..785206e3b9
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_batch.cl
@@ -0,0 +1,155 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The block shape tensor rank must be passed at compile time using -DBLOCK_SHAPE_DIM. e.g. -DBLOCK_SHAPE_DIM=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] paddings_ptr Pointer to the second source image. Supported data types: S32
+ * @param[in] paddings_stride_x Stride of the paddinds tensor in X dimension (in bytes)
+ * @param[in] paddings_step_x paddings_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] paddings_stride_y Stride of the paddinds tensor in Y dimension (in bytes)
+ * @param[in] paddings_step_y paddings_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] paddingse_offset_first_element_in_bytes The offset of the first element in the second source image
+ * @param[in] block_shape_ptr Pointer to the block shape tensor. Supported data types: S32
+ * @param[in] block_shape_stride_x Stride of the block shape tensor in X dimension (in bytes)
+ * @param[in] block_shape_step_x block_shape_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] block_shape_offset_first_element_in_bytes The offset of the first element in the block shapetensor
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_nhwc(
+ TENSOR4D_DECLARATION(input),
+ IMAGE_DECLARATION(paddings),
+ VECTOR_DECLARATION(block_shape),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Image pad = CONVERT_TO_IMAGE_STRUCT_NO_STEP(paddings);
+ Vector block = CONVERT_TO_VECTOR_STRUCT_NO_STEP(block_shape);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int pad_left_x = *((__global int *)offset(&pad, 0, 0));
+ const int pad_right_x = *((__global int *)offset(&pad, 1, 0));
+ const int pad_left_y = *((__global int *)offset(&pad, 0, 1));
+ const int pad_right_y = *((__global int *)offset(&pad, 1, 1));
+
+ int block_x = *((__global int *)vector_offset(&block, 0));
+ int block_y = *((__global int *)vector_offset(&block, 1));
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(((pos_y >= pad_left_y) && (pos_y < pad_left_y + HEIGHT_IN) && (pos_x >= pad_left_x) && (pos_x < pad_left_x + WIDTH_IN)))
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - pad_left_x;
+ const int in_y = pos_y - pad_left_y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+
+#if defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN)
+/** Calculate the space to batch conversion. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DBATCH_SIZE. e.g. -DBATCH_SIZE=2
+ * @note The block shape x must be passed at compile time using -DBLOCK_SHAPE_X. e.g. -DBLOCK_SHAPE_X=2
+ * @note The block shape y must be passed at compile time using -DBLOCK_SHAPE_Y. e.g. -DBLOCK_SHAPE_Y=2
+ * @note The starting pad value of x must be passed at compile time using -DPAD_LEFT_X. e.g. -DPAD_LEFT_X=2
+ * @note The ending pad value of x must be passed at compile time using -DPAD_RIGHT_X. e.g. -DPAD_RIGHT_X=2
+ * @note The starting pad value of y must be passed at compile time using -DPAD_LEFT_Y. e.g. -DPAD_LEFT_Y=2
+ * @note The ending pad value of y must be passed at compile time using -DPAD_RIGHT_Y. e.g. -DPAD_RIGHT_X=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source image
+ * @param[in] batch_id The output tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination image
+ */
+__kernel void space_to_batch_static_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ int block_x = BLOCK_SHAPE_X;
+ int block_y = BLOCK_SHAPE_Y;
+
+ const int out_x = get_global_id(1);
+ const int out_y = get_global_id(2);
+ const int z = get_global_id(0);
+
+ const int pos_x = out_x * block_x + ((batch_id / BATCH_IN) % block_x);
+ const int pos_y = out_y * block_y + ((batch_id / BATCH_IN) / block_x);
+
+ if(pos_y >= PAD_LEFT_Y && pos_y < PAD_LEFT_Y + HEIGHT_IN && pos_x >= PAD_LEFT_X && pos_x < PAD_LEFT_X + WIDTH_IN)
+ {
+ const int w = batch_id % BATCH_IN;
+ const int in_x = pos_x - PAD_LEFT_X;
+ const int in_y = pos_y - PAD_LEFT_Y;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, w));
+ }
+}
+#endif // defined(BATCH_SIZE) && defined(DATA_TYPE) && defined(BLOCK_SHAPE_X) && defined(BLOCK_SHAPE_Y) && defined(PAD_LEFT_X) && defined(PAD_RIGHT_X) && defined(PAD_LEFT_Y) && defined(PAD_RIGHT_Y) && defined(WIDTH_IN) && defined(HEIGHT_IN) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/space_to_depth.cl b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
new file mode 100644
index 0000000000..d44e78d990
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/space_to_depth.cl
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE)
+/** Space to depth transformation. (NHWC)
+ *
+ * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float
+ * @note The input tensor batch size must be passed at compile time using -DCHANNEL_SIZE. e.g. -DCHANNEL_SIZE=2
+ * @note The block shape must be passed at compile time using -DBLOCK_SHAPE. e.g. -DBLOCK_SHAPE=2
+ *
+ * @param[in] input_ptr Pointer to the source tensor. Supported data types: All
+ * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] input_offset_first_element_in_bytes The offset of the first element in the first source tensor
+ * @param[in] batch_id The input tensor batch id
+ * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
+ * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void space_to_depth_nhwc(
+ TENSOR4D_DECLARATION(input),
+ const int batch_id,
+ TENSOR3D_DECLARATION(output))
+{
+ Tensor4D in = CONVERT_TO_TENSOR4D_STRUCT_NO_STEP(input, 0);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(output);
+
+ const int r = (CHANNEL_SIZE / (BLOCK_SHAPE * BLOCK_SHAPE));
+ const int x = get_global_id(1);
+ const int y = get_global_id(2);
+ const int z = get_global_id(0) % r;
+
+ const int in_x = x * BLOCK_SHAPE + (get_global_id(0) / r) % BLOCK_SHAPE;
+ const int in_y = y * BLOCK_SHAPE + (get_global_id(0) / r) / BLOCK_SHAPE;
+
+ *((__global DATA_TYPE *)out.ptr) = *((__global DATA_TYPE *)tensor4D_offset(&in, z, in_x, in_y, batch_id));
+}
+#endif // defined(DATA_TYPE) && defined(BLOCK_SHAPE) && defined(CHANNEL_SIZE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/upsample_layer.cl b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
index d0cc0f24b7..74b9674a88 100644
--- a/src/core/CL/cl_kernels/upsample_layer.cl
+++ b/src/core/CL/cl_kernels/nhwc/upsample_layer.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2020 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -23,61 +23,6 @@
*/
#include "helpers.h"
-/** This function applies upsample on an input image. (NCHW)
- *
- * @attention The following variables must be passed at compile time:
- * -# -DDATA_TYPE = Tensor data type. Supported data types: All
- * -# -DVEC_SIZE_IN = Input vector size
- * -# -DVEC_SIZE_OUT = Output vector size
- * -# -DLAST_ACCESSED_X_IN = The input element that is on the X border (threads trying to set this, might need to step back a bit)
- * -# -DLAST_ACCESSED_X_OUT = The output element that is on the X border (threads trying to set this, might need to step back a bit)
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: All
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_ptr Pointer to the destination image. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void upsample_layer_nchw(
- TENSOR3D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor3D src = CONVERT_TO_TENSOR3D_STRUCT(src);
- Tensor3D dst = CONVERT_TO_TENSOR3D_STRUCT(dst);
-
-#if defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- // Check if access on width gets out of bounds
- // If it does shift access vector to access elements within bounds
- const int xi_in = (int)(get_global_id(0) * VEC_SIZE_IN);
- const int xi_out = (int)(get_global_id(0) * VEC_SIZE_OUT);
- src.ptr -= max(xi_in - (int)LAST_ACCESSED_X_IN, 0) * src_stride_x;
- dst.ptr -= max(xi_out - (int)LAST_ACCESSED_X_OUT, 0) * dst_stride_x;
-
- VEC_DATA_TYPE(DATA_TYPE, 8)
- data = vload8(0, (__global DATA_TYPE *)src.ptr);
-
- VEC_DATA_TYPE(DATA_TYPE, 16)
- data_out = (VEC_DATA_TYPE(DATA_TYPE, 16))(data.s0, data.s0, data.s1, data.s1, data.s2, data.s2, data.s3, data.s3, data.s4, data.s4, data.s5, data.s5, data.s6, data.s6, data.s7, data.s7);
-
- vstore16(data_out, 0, (__global DATA_TYPE *)dst.ptr);
- vstore16(data_out, 0, (__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0));
-#else // !defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 0)) = *((__global DATA_TYPE *)src.ptr);
- *((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 0)) = *((__global DATA_TYPE *)src.ptr);
-#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
-
/** This function applies upsample on an input image. (NHWC)
*
* @attention The following variables must be passed at compile time:
@@ -132,4 +77,4 @@ __kernel void upsample_layer_nhwc(
*((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 0, 1)) = *((__global DATA_TYPE *)src.ptr);
*((__global DATA_TYPE *)tensor3D_offset(&dst, 0, 1, 1)) = *((__global DATA_TYPE *)src.ptr);
#endif // defined(VEC_SIZE_IN) && defined(VEC_SIZE_OUT) && defined(LAST_ACCESSED_X_IN) && defined(LAST_ACCESSED_X_OUT)
-}
+} \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/winograd_filter_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
index 5c3bb8aa9b..8d5fd3437f 100644
--- a/src/core/CL/cl_kernels/winograd_filter_transform.cl
+++ b/src/core/CL/cl_kernels/nhwc/winograd_filter_transform.cl
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2019 Arm Limited.
+ * Copyright (c) 2018-2021 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -37,288 +37,6 @@
out.s7 = tmp.s6; \
})
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 2x2/2x1/1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x2_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out0 = 0.0f;
- out0.s0 = (w0.s0);
- out0.s1 = (w0.s0 + w0.s1 + w0.s2) * 0.5f;
- out0.s2 = (w0.s0 + w0.s2 - w0.s1) * 0.5f;
- out0.s3 = (w0.s2);
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out1 = 0.0f;
- out1.s0 = (w0.s0 + w1.s0 + w2.s0) * 0.5f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) * 0.25f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 + w0.s2 + w1.s2 + w2.s2 - w0.s1 - w1.s1 - w2.s1) * 0.25f;
- out1.s3 = (w0.s2 + w1.s2 + w2.s2) * 0.5f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out2 = 0.0f;
- out2.s0 = (w0.s0 + w2.s0 - w1.s0) * 0.5f;
- out2.s1 = (w0.s0 + w2.s0 + w0.s1 + w2.s1 + w0.s2 + w2.s2 - w1.s0 - w1.s1 - w1.s2) * 0.25f;
- out2.s2 = (w0.s0 + w2.s0 + w1.s1 + w0.s2 + w2.s2 - w1.s0 - w0.s1 - w2.s1 - w1.s2) * 0.25f;
- out2.s3 = (w0.s2 + w2.s2 - w1.s2) * 0.5f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 4)
- out3 = 0.0f;
- out3.s0 = (w2.s0);
- out3.s1 = (w2.s0 + w2.s1 + w2.s2) * 0.5f;
- out3.s2 = (w2.s0 + w2.s2 - w2.s1) * 0.5f;
- out3.s3 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 16 channels for 3x3 kernels
- // 4 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out3.s3;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NCHW and the output tile is 4x4/4x1/1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note If this kernel is used to perform Winograd filter transform 3x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x3, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_3x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr));
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = (VEC_DATA_TYPE(DATA_TYPE, 3))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w0 = vload3(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w1 = vload3(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- VEC_DATA_TYPE(DATA_TYPE, 3)
- w2 = vload3(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = (w0.s0) / 16.f;
- out0.s1 = (-w0.s0 - w0.s1 - w0.s2) / 24.f;
- out0.s2 = (-w0.s0 + w0.s1 - w0.s2) / 24.f;
- out0.s3 = (w0.s0 + 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s4 = (w0.s0 - 2.f * w0.s1 + 4.f * w0.s2) / 96.f;
- out0.s5 = (w0.s2) / 4.f;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = (-w0.s0 - w1.s0 - w2.s0) / 24.f;
- out1.s1 = (w0.s0 + w1.s0 + w2.s0 + w0.s1 + w1.s1 + w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s2 = (w0.s0 + w1.s0 + w2.s0 - w0.s1 - w1.s1 - w2.s1 + w0.s2 + w1.s2 + w2.s2) / 36.f;
- out1.s3 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (-w0.s1 - w1.s1 - w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s4 = (-w0.s0 - w1.s0 - w2.s0 + 2.f * (w0.s1 + w1.s1 + w2.s1) + 4.f * (-w0.s2 - w1.s2 - w2.s2)) / 144.f;
- out1.s5 = (-w0.s2 - w1.s2 - w2.s2) / 6.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = (-w0.s0 + w1.s0 - w2.s0) / 24.f;
- out2.s1 = (w0.s0 - w1.s0 + w2.s0 + w0.s1 - w1.s1 + w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s2 = (w0.s0 - w1.s0 + w2.s0 - w0.s1 + w1.s1 - w2.s1 + w0.s2 - w1.s2 + w2.s2) / 36.f;
- out2.s3 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (-w0.s1 + w1.s1 - w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s4 = (-w0.s0 + w1.s0 - w2.s0 + 2.f * (w0.s1 - w1.s1 + w2.s1) + 4.f * (-w0.s2 + w1.s2 - w2.s2)) / 144.f;
- out2.s5 = (-w0.s2 + w1.s2 - w2.s2) / 6.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out3.s1 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 - 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s2 = (-w0.s0 - 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 + 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 - 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out3.s3 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 + 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s4 = ((w0.s0 + 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 - 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out3.s5 = (w0.s2 + 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) / 96.f;
- out4.s1 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 - w0.s1 + 2.f * w1.s1 - 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s2 = (-w0.s0 + 2.f * w1.s0 - 4.f * w2.s0 + w0.s1 - 2.f * w1.s1 + 4.f * w2.s1 - w0.s2 + 2.f * w1.s2 - 4.f * w2.s2) / 144.f;
- out4.s3 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (w0.s1 - 2.f * w1.s1 + 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s4 = ((w0.s0 - 2.f * w1.s0 + 4.f * w2.s0) + 2.f * (-w0.s1 + 2.f * w1.s1 - 4.f * w2.s1) + 4.f * (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2)) / 576.f;
- out4.s5 = (w0.s2 - 2.f * w1.s2 + 4.f * w2.s2) / 24.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (w2.s0) / 4.f;
- out5.s1 = (-w2.s0 - w2.s1 - w2.s2) / 6.f;
- out5.s2 = (-w2.s0 + w2.s1 - w2.s2) / 6.f;
- out5.s3 = (w2.s0 + 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s4 = (w2.s0 - 2.f * w2.s1 + 4.f * w2.s2) / 24.f;
- out5.s5 = (w2.s2);
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * dst_stride_x + y0 * dst_stride_y;
-
- // Store the values across the channels
- // 36 channels for 3x3 kernels
- // 6 channels for 3x1 or 1x3 kernels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out5.s5;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
/** This OpenCL kernel performs Winograd filter transform 3x3/3x1/1x3 when the data layout is NHWC and the output tile is 4x4/4x1/1x4
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -477,308 +195,6 @@ __kernel void winograd_filter_transform_4x4_3x3_nhwc(
#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
}
-/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NCHW and the output tile is 4x4/4x1 or 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- *
- * @note If this kernel is used to perform Winograd filter transform 5x1, -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time
- * @note If this kernel is used to perform Winograd filter transform 1x5, -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x4_5x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- Tensor4D src = CONVERT_TO_TENSOR4D_STRUCT(src, SRC_DIM_Z);
-
- const __global uchar *src_addr = tensor4D_offset(&src, 0, 0, 0, 0);
-
- // Load the values from the input tensor
-#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
-#elif defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = (VEC_DATA_TYPE(DATA_TYPE, 4))(*((__global DATA_TYPE *)(src_addr + 0 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y)),
- *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y)));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
-#else // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w00 = vload4(0, (__global DATA_TYPE *)(src_addr + 0 * src_stride_y));
- DATA_TYPE w01 = *((__global DATA_TYPE *)(src_addr + 0 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w10 = vload4(0, (__global DATA_TYPE *)(src_addr + 1 * src_stride_y));
- DATA_TYPE w11 = *((__global DATA_TYPE *)(src_addr + 1 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w20 = vload4(0, (__global DATA_TYPE *)(src_addr + 2 * src_stride_y));
- DATA_TYPE w21 = *((__global DATA_TYPE *)(src_addr + 2 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w30 = vload4(0, (__global DATA_TYPE *)(src_addr + 3 * src_stride_y));
- DATA_TYPE w31 = *((__global DATA_TYPE *)(src_addr + 3 * src_stride_y) + 4);
- VEC_DATA_TYPE(DATA_TYPE, 4)
- w40 = vload4(0, (__global DATA_TYPE *)(src_addr + 4 * src_stride_y));
- DATA_TYPE w41 = *((__global DATA_TYPE *)(src_addr + 4 * src_stride_y) + 4);
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-
- // Transform the input tile
-
- // Row 0
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out0 = 0.0f;
- out0.s0 = w00.s0;
- out0.s1 = -2.f * (w00.s0 + w00.s1 + w00.s2 + w00.s3 + w01) / 9.f;
- out0.s2 = -2.f * (w00.s0 - w00.s1 + w00.s2 - w00.s3 + w01) / 9.f;
- out0.s3 = (w00.s0 + 2.f * w00.s1 + 4.f * w00.s2 + 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s4 = (w00.s0 - 2.f * w00.s1 + 4.f * w00.s2 - 8.f * w00.s3 + 16.f * w01) / 90.f;
- out0.s5 = (16.f * w00.s0 + 8.f * w00.s1 + 4.f * w00.s2 + 2.f * w00.s3 + w01) / 180.f;
- out0.s6 = (16.f * w00.s0 - 8.f * w00.s1 + 4.f * w00.s2 - 2.f * w00.s3 + w01) / 180.f;
- out0.s7 = w01;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- // Row 1
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out1 = 0.0f;
- out1.s0 = -2.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) / 9.f;
- out1.s1 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) +
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s2 = 4.f * ((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) -
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 81.f;
- out1.s3 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s4 = -((w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 2.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 8.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + 16.f * (w01 + w11 + w21 + w31 + w41)) / 405.f;
- out1.s5 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) + 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) + 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s6 = -(16.f * (w00.s0 + w10.s0 + w20.s0 + w30.s0 + w40.s0) - 8.f * (w00.s1 + w10.s1 + w20.s1 + w30.s1 + w40.s1) + 4.f * (w00.s2 + w10.s2 + w20.s2 + w30.s2 + w40.s2) - 2.f *
- (w00.s3 + w10.s3 + w20.s3 + w30.s3 + w40.s3) + (w01 + w11 + w21 + w31 + w41)) / 810.f;
- out1.s7 = -2.f * (w01 + w11 + w21 + w31 + w41) / 9.f;
-
- // Row 2
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out2 = 0.0f;
- out2.s0 = -2.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) / 9.f;
- out2.s1 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) +
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s2 = 4.f * ((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) -
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 81.f;
- out2.s3 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s4 = -((w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 2.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 8.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + 16.f * (w01 - w11 + w21 - w31 + w41)) / 405.f;
- out2.s5 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) + 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) + 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s6 = -(16.f * (w00.s0 - w10.s0 + w20.s0 - w30.s0 + w40.s0) - 8.f * (w00.s1 - w10.s1 + w20.s1 - w30.s1 + w40.s1) + 4.f * (w00.s2 - w10.s2 + w20.s2 - w30.s2 + w40.s2) - 2.f *
- (w00.s3 - w10.s3 + w20.s3 - w30.s3 + w40.s3) + (w01 - w11 + w21 - w31 + w41)) / 810.f;
- out2.s7 = -2.f * (w01 - w11 + w21 - w31 + w41) / 9.f;
-
- // Row 3
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out3 = 0.0f;
- out3.s0 = (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out3.s1 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s2 = -((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 405.f;
- out3.s3 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s4 = ((w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 8100.f;
- out3.s5 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s6 = (16.f * (w00.s0 + 2.f * w10.s0 + 4.f * w20.s0 + 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 + 2.f * w10.s1 + 4.f * w20.s1 + 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 + 2.f * w10.s2 + 4.f * w20.s2 + 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 + 2.f * w10.s3 + 4.f * w20.s3 + 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41)) / 16200.f;
- out3.s7 = (w01 + 2.f * w11 + 4.f * w21 + 8.f * w31 + 16.f * w41) / 90.f;
-
- // Row 4
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out4 = 0.0f;
- out4.s0 = (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) / 90.f;
- out4.s1 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s2 = -((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) +
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 405.f;
- out4.s3 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s4 = ((w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 2.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 8.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) + 16.f *
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 8100.f;
- out4.s5 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) + 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) + 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s6 = (16.f * (w00.s0 - 2.f * w10.s0 + 4.f * w20.s0 - 8.f * w30.s0 + 16.f * w40.s0) - 8.f * (w00.s1 - 2.f * w10.s1 + 4.f * w20.s1 - 8.f * w30.s1 + 16.f * w40.s1) + 4.f *
- (w00.s2 - 2.f * w10.s2 + 4.f * w20.s2 - 8.f * w30.s2 + 16.f * w40.s2) - 2.f * (w00.s3 - 2.f * w10.s3 + 4.f * w20.s3 - 8.f * w30.s3 + 16.f * w40.s3) +
- (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41)) / 16200.f;
- out4.s7 = (w01 - 2.f * w11 + 4.f * w21 - 8.f * w31 + 16.f * w41) / 90.f;
-
- // Row 5
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out5 = 0.0f;
- out5.s0 = (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) / 180.f;
- out5.s1 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s2 = -((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 810.f;
- out5.s3 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s4 = ((16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 16200.f;
- out5.s5 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s6 = (16.f * (16.f * w00.s0 + 8.f * w10.s0 + 4.f * w20.s0 + 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 + 8.f * w10.s1 + 4.f * w20.s1 + 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 + 8.f * w10.s2 + 4.f * w20.s2 + 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 + 8.f * w10.s3 + 4.f * w20.s3 + 2.f * w30.s3 + w40.s3) +
- (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41)) / 32400.f;
- out5.s7 = (16.f * w01 + 8.f * w11 + 4.f * w21 + 2.f * w31 + w41) / 180.f;
-
- // Row 6
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out6 = 0.0f;
- out6.s0 = (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) / 180.f;
- out6.s1 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s2 = -((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) +
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 810.f;
- out6.s3 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s4 = ((16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 2.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 8.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) + 16.f *
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 16200.f;
- out6.s5 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) + 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) + 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s6 = (16.f * (16.f * w00.s0 - 8.f * w10.s0 + 4.f * w20.s0 - 2.f * w30.s0 + w40.s0) - 8.f * (16.f * w00.s1 - 8.f * w10.s1 + 4.f * w20.s1 - 2.f * w30.s1 + w40.s1) + 4.f *
- (16.f * w00.s2 - 8.f * w10.s2 + 4.f * w20.s2 - 2.f * w30.s2 + w40.s2) - 2.f * (16.f * w00.s3 - 8.f * w10.s3 + 4.f * w20.s3 - 2.f * w30.s3 + w40.s3) +
- (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41)) / 32400.f;
- out6.s7 = (16.f * w01 - 8.f * w11 + 4.f * w21 - 2.f * w31 + w41) / 180.f;
-
- // Row 7
- VEC_DATA_TYPE(DATA_TYPE, 8)
- out7 = 0.0f;
- out7.s0 = w40.s0;
- out7.s1 = -2.f * (w40.s0 + w40.s1 + w40.s2 + w40.s3 + w41) / 9.f;
- out7.s2 = -2.f * (w40.s0 - w40.s1 + w40.s2 - w40.s3 + w41) / 9.f;
- out7.s3 = (w40.s0 + 2.f * w40.s1 + 4.f * w40.s2 + 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s4 = (w40.s0 - 2.f * w40.s1 + 4.f * w40.s2 - 8.f * w40.s3 + 16.f * w41) / 90.f;
- out7.s5 = (16.f * w40.s0 + 8.f * w40.s1 + 4.f * w40.s2 + 2.f * w40.s3 + w41) / 180.f;
- out7.s6 = (16.f * w40.s0 - 8.f * w40.s1 + 4.f * w40.s2 - 2.f * w40.s3 + w41) / 180.f;
- out7.s7 = w41;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-
- int z = get_global_id(2);
- int x0 = z / SRC_DIM_Z; // idx filter
- int y0 = z % SRC_DIM_Z; // idx channel
-
- // Get output address
- __global uchar *dst_addr = dst_ptr + dst_offset_first_element_in_bytes + x0 * sizeof(DATA_TYPE) + y0 * dst_stride_y;
-
- // Store the values across the channels
- *(__global DATA_TYPE *)(dst_addr + 0 * dst_stride_z) = out0.s0;
- *(__global DATA_TYPE *)(dst_addr + 1 * dst_stride_z) = out0.s1;
- *(__global DATA_TYPE *)(dst_addr + 2 * dst_stride_z) = out0.s2;
- *(__global DATA_TYPE *)(dst_addr + 3 * dst_stride_z) = out0.s3;
- *(__global DATA_TYPE *)(dst_addr + 4 * dst_stride_z) = out0.s4;
- *(__global DATA_TYPE *)(dst_addr + 5 * dst_stride_z) = out0.s5;
- *(__global DATA_TYPE *)(dst_addr + 6 * dst_stride_z) = out0.s6;
- *(__global DATA_TYPE *)(dst_addr + 7 * dst_stride_z) = out0.s7;
-
-#if !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
- *(__global DATA_TYPE *)(dst_addr + 8 * dst_stride_z) = out1.s0;
- *(__global DATA_TYPE *)(dst_addr + 9 * dst_stride_z) = out1.s1;
- *(__global DATA_TYPE *)(dst_addr + 10 * dst_stride_z) = out1.s2;
- *(__global DATA_TYPE *)(dst_addr + 11 * dst_stride_z) = out1.s3;
- *(__global DATA_TYPE *)(dst_addr + 12 * dst_stride_z) = out1.s4;
- *(__global DATA_TYPE *)(dst_addr + 13 * dst_stride_z) = out1.s5;
- *(__global DATA_TYPE *)(dst_addr + 14 * dst_stride_z) = out1.s6;
- *(__global DATA_TYPE *)(dst_addr + 15 * dst_stride_z) = out1.s7;
- *(__global DATA_TYPE *)(dst_addr + 16 * dst_stride_z) = out2.s0;
- *(__global DATA_TYPE *)(dst_addr + 17 * dst_stride_z) = out2.s1;
- *(__global DATA_TYPE *)(dst_addr + 18 * dst_stride_z) = out2.s2;
- *(__global DATA_TYPE *)(dst_addr + 19 * dst_stride_z) = out2.s3;
- *(__global DATA_TYPE *)(dst_addr + 20 * dst_stride_z) = out2.s4;
- *(__global DATA_TYPE *)(dst_addr + 21 * dst_stride_z) = out2.s5;
- *(__global DATA_TYPE *)(dst_addr + 22 * dst_stride_z) = out2.s6;
- *(__global DATA_TYPE *)(dst_addr + 23 * dst_stride_z) = out2.s7;
- *(__global DATA_TYPE *)(dst_addr + 24 * dst_stride_z) = out3.s0;
- *(__global DATA_TYPE *)(dst_addr + 25 * dst_stride_z) = out3.s1;
- *(__global DATA_TYPE *)(dst_addr + 26 * dst_stride_z) = out3.s2;
- *(__global DATA_TYPE *)(dst_addr + 27 * dst_stride_z) = out3.s3;
- *(__global DATA_TYPE *)(dst_addr + 28 * dst_stride_z) = out3.s4;
- *(__global DATA_TYPE *)(dst_addr + 29 * dst_stride_z) = out3.s5;
- *(__global DATA_TYPE *)(dst_addr + 30 * dst_stride_z) = out3.s6;
- *(__global DATA_TYPE *)(dst_addr + 31 * dst_stride_z) = out3.s7;
- *(__global DATA_TYPE *)(dst_addr + 32 * dst_stride_z) = out4.s0;
- *(__global DATA_TYPE *)(dst_addr + 33 * dst_stride_z) = out4.s1;
- *(__global DATA_TYPE *)(dst_addr + 34 * dst_stride_z) = out4.s2;
- *(__global DATA_TYPE *)(dst_addr + 35 * dst_stride_z) = out4.s3;
- *(__global DATA_TYPE *)(dst_addr + 36 * dst_stride_z) = out4.s4;
- *(__global DATA_TYPE *)(dst_addr + 37 * dst_stride_z) = out4.s5;
- *(__global DATA_TYPE *)(dst_addr + 38 * dst_stride_z) = out4.s6;
- *(__global DATA_TYPE *)(dst_addr + 39 * dst_stride_z) = out4.s7;
- *(__global DATA_TYPE *)(dst_addr + 40 * dst_stride_z) = out5.s0;
- *(__global DATA_TYPE *)(dst_addr + 41 * dst_stride_z) = out5.s1;
- *(__global DATA_TYPE *)(dst_addr + 42 * dst_stride_z) = out5.s2;
- *(__global DATA_TYPE *)(dst_addr + 43 * dst_stride_z) = out5.s3;
- *(__global DATA_TYPE *)(dst_addr + 44 * dst_stride_z) = out5.s4;
- *(__global DATA_TYPE *)(dst_addr + 45 * dst_stride_z) = out5.s5;
- *(__global DATA_TYPE *)(dst_addr + 46 * dst_stride_z) = out5.s6;
- *(__global DATA_TYPE *)(dst_addr + 47 * dst_stride_z) = out5.s7;
- *(__global DATA_TYPE *)(dst_addr + 48 * dst_stride_z) = out6.s0;
- *(__global DATA_TYPE *)(dst_addr + 49 * dst_stride_z) = out6.s1;
- *(__global DATA_TYPE *)(dst_addr + 50 * dst_stride_z) = out6.s2;
- *(__global DATA_TYPE *)(dst_addr + 51 * dst_stride_z) = out6.s3;
- *(__global DATA_TYPE *)(dst_addr + 52 * dst_stride_z) = out6.s4;
- *(__global DATA_TYPE *)(dst_addr + 53 * dst_stride_z) = out6.s5;
- *(__global DATA_TYPE *)(dst_addr + 54 * dst_stride_z) = out6.s6;
- *(__global DATA_TYPE *)(dst_addr + 55 * dst_stride_z) = out6.s7;
- *(__global DATA_TYPE *)(dst_addr + 56 * dst_stride_z) = out7.s0;
- *(__global DATA_TYPE *)(dst_addr + 57 * dst_stride_z) = out7.s1;
- *(__global DATA_TYPE *)(dst_addr + 58 * dst_stride_z) = out7.s2;
- *(__global DATA_TYPE *)(dst_addr + 59 * dst_stride_z) = out7.s3;
- *(__global DATA_TYPE *)(dst_addr + 60 * dst_stride_z) = out7.s4;
- *(__global DATA_TYPE *)(dst_addr + 61 * dst_stride_z) = out7.s5;
- *(__global DATA_TYPE *)(dst_addr + 62 * dst_stride_z) = out7.s6;
- *(__global DATA_TYPE *)(dst_addr + 63 * dst_stride_z) = out7.s7;
-#endif // !defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-}
-
/** This OpenCL kernel performs Winograd filter transform 5x5/5x1 or 1x5 when the data layout is NHWC and the output tile is 4x4/4x1 or 1x4
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1360,152 +776,6 @@ __kernel void winograd_filter_transform_2x2_7x7_nhwc(
#endif // defined(SRC_DIM_Z)
#if defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 2x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_2x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_3x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 5x1 when the data layout is NCHW and the output tile is 4x1
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_4x1_5x1_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
/** This OpenCL kernel performs Winograd filter transform 3x1 when the data layout is NHWC and the output tile is 4x1
*
@@ -1656,153 +926,6 @@ __kernel void winograd_filter_transform_2x1_7x1_nhwc(
#endif // defined(WINOGRAD_FILTER_TRANSFORM_HORIZONTAL)
#if defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x2
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x2_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_2x2_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x3_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_3x3_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
-/** This OpenCL kernel performs Winograd filter transform 1x5 when the data layout is NCHW and the output tile is 1x4
- *
- * @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
- * @note -DWINOGRAD_FILTER_TRANSFORM_VERTICAL has to be passed at compile time to perform Winograd Filter Transform
- * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
- *
- * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
- * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void winograd_filter_transform_1x4_1x5_nchw(
- TENSOR4D_DECLARATION(src),
- TENSOR3D_DECLARATION(dst))
-{
- winograd_filter_transform_4x4_5x5_nchw(src_ptr,
- src_stride_x,
- src_step_x,
- src_stride_y,
- src_step_y,
- src_stride_z,
- src_step_z,
- src_stride_w,
- src_step_w,
- src_offset_first_element_in_bytes,
- dst_ptr,
- dst_stride_x,
- dst_step_x,
- dst_stride_y,
- dst_step_y,
- dst_stride_z,
- dst_step_z,
- dst_offset_first_element_in_bytes);
-}
-
/** This OpenCL kernel performs Winograd filter transform 1x3 when the data layout is NHWC and the output tile is 1x4
*
* @note In order to correctly split the input tensor in batches, its dimension across the Z axis (channels for NCHW, height for NHWC) must be passed at compile time using -DSRC_DIM_Z: e.g. -DSRC_DIM_Z=64
@@ -1949,4 +1072,4 @@ __kernel void winograd_filter_transform_1x2_1x7_nhwc(
dst_step_z,
dst_offset_first_element_in_bytes);
}
-#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL)
+#endif // defined(WINOGRAD_FILTER_TRANSFORM_VERTICAL) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
new file mode 100644
index 0000000000..4865982a55
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_input_transform.cl
@@ -0,0 +1,953 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#define OUTPUT_ROW_4x4_5x5(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = tmp.s2 - 4.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = tmp.s1 - 4.25f * tmp.s3 + tmp.s5; \
+ comm_fact.s2 = 2.5f * tmp.s3; \
+ comm_fact.s3 = 0.5f * tmp.s1 + 2.f * tmp.s5 - comm_fact.s2; \
+ comm_fact.s4 = 0.25f * tmp.s2 - 1.25f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 4.f * tmp.s2 + tmp.s6 - 5.f * tmp.s4; \
+ comm_fact.s6 = 2.f * tmp.s1 + 0.5f * tmp.s5 - comm_fact.s2; \
+ \
+ out.s0 = tmp.s0 - tmp.s6 + 5.25f * tmp.s4 - 5.25f * tmp.s2; \
+ out.s1 = comm_fact.s0 + comm_fact.s1; \
+ out.s2 = comm_fact.s0 - comm_fact.s1; \
+ out.s3 = comm_fact.s3 + comm_fact.s4; \
+ out.s4 = comm_fact.s4 - comm_fact.s3; \
+ out.s5 = comm_fact.s5 + comm_fact.s6; \
+ out.s6 = comm_fact.s5 - comm_fact.s6; \
+ out.s7 = tmp.s7 - tmp.s1 + 5.25f * tmp.s3 - 5.25f * tmp.s5; \
+ })
+
+#define OUTPUT_ROW_2x2_7x7(out, tmp, comm_fact) \
+ ({ \
+ comm_fact.s0 = 36.0f * tmp.s2 - 13.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s1 = 36.0f * tmp.s1 - 13.0f * tmp.s3 + 1.0f * tmp.s5; \
+ comm_fact.s2 = 9.0f * tmp.s2 - 10.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s3 = 18.0f * tmp.s1 - 20.0f * tmp.s3 + 2.0f * tmp.s5; \
+ comm_fact.s4 = 4.0f * tmp.s2 - 5.0f * tmp.s4 + tmp.s6; \
+ comm_fact.s5 = 12.0f * tmp.s1 - 15.0f * tmp.s3 + 3.0f * tmp.s5; \
+ out.s0 = -36.0f * tmp.s0 + 49.0f * tmp.s2 + -14.0f * tmp.s4 + tmp.s6; \
+ out.s1 = comm_fact.s0 - comm_fact.s1; \
+ out.s2 = comm_fact.s0 + comm_fact.s1; \
+ out.s3 = comm_fact.s2 - comm_fact.s3; \
+ out.s4 = comm_fact.s2 + comm_fact.s3; \
+ out.s5 = comm_fact.s4 - comm_fact.s5; \
+ out.s6 = comm_fact.s4 + comm_fact.s5; \
+ out.s7 = -36.0f * tmp.s1 + 0.0f * tmp.s2 + 49.0f * tmp.s3 - 14.0f * tmp.s5 + tmp.s7; \
+ })
+
+#if defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+
+#if defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_3x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+#define _INUM_TILES_Y NUM_TILES_Y
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, 1, in);
+ TILE(DATA_TYPE, 6, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 6, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 6, 1, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v *= 4.0f;
+ })
+
+ com[0].v = in[2].v - 4.f * in[0].v;
+ com[1].v = in[3].v - 4.f * in[1].v;
+ com[2].v = in[4].v - 4.f * in[2].v;
+ com[3].v = in[5].v - 4.f * in[3].v;
+ com[4].v = in[3].v - in[1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4].v - in[2].v;
+
+ out[0].v = com[2].v - com[0].v;
+ out[1].v = com[2].v + com[1].v;
+ out[2].v = com[2].v - com[1].v;
+ out[3].v = com[5].v + com[4].v;
+ out[4].v = com[5].v - com[4].v;
+ out[5].v = com[3].v - com[1].v;
+
+ TILE(uint, 6, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+ dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 6;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 6, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 36, 1, in);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 6, 6, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 6, 1, com);
+ TILE(DATA_TYPE, 36, 1, tmp);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = in[2 * 6 + i].v - (DATA_TYPE)4.0f * in[0 * 6 + i].v;
+ com[1].v = in[3 * 6 + i].v - (DATA_TYPE)4.0f * in[1 * 6 + i].v;
+ com[2].v = in[4 * 6 + i].v - (DATA_TYPE)4.0f * in[2 * 6 + i].v;
+ com[3].v = in[5 * 6 + i].v - (DATA_TYPE)4.0f * in[3 * 6 + i].v;
+ com[4].v = in[3 * 6 + i].v - in[1 * 6 + i].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = in[4 * 6 + i].v - in[2 * 6 + i].v;
+ tmp[i + 0 * 6].v = com[2].v - com[0].v;
+ tmp[i + 1 * 6].v = com[2].v + com[1].v;
+ tmp[i + 2 * 6].v = com[2].v - com[1].v;
+ tmp[i + 3 * 6].v = com[5].v + com[4].v;
+ tmp[i + 4 * 6].v = com[5].v - com[4].v;
+ tmp[i + 5 * 6].v = com[3].v - com[1].v;
+ })
+
+ TILE(DATA_TYPE, 36, 1, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ com[0].v = tmp[i * 6 + 2].v - 4.f *tmp[i * 6 + 0].v;
+ com[1].v = tmp[i * 6 + 3].v - 4.f *tmp[i * 6 + 1].v;
+ com[2].v = tmp[i * 6 + 4].v - 4.f *tmp[i * 6 + 2].v;
+ com[3].v = tmp[i * 6 + 5].v - 4.f *tmp[i * 6 + 3].v;
+ com[4].v = tmp[i * 6 + 3].v - tmp[i * 6 + 1].v;
+ com[4].v = com[4].v + com[4].v;
+ com[5].v = tmp[i * 6 + 4].v - tmp[i * 6 + 2].v;
+ out[i * 6 + 0].v = com[2].v - com[0].v;
+ out[i * 6 + 1].v = com[2].v + com[1].v;
+ out[i * 6 + 2].v = com[2].v - com[1].v;
+ out[i * 6 + 3].v = com[5].v + com[4].v;
+ out[i * 6 + 4].v = com[5].v - com[4].v;
+ out[i * 6 + 5].v = com[3].v - com[1].v;
+ })
+
+ // Compute destination address
+ TILE(uint, 36, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+ dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 36;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 36, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x5/5x1 or 1x5 and the output tile is 4x4/4x1 or 1x4 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x4_5x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+#define _INUM_TILES_Y NUM_TILES_Y
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ TILE(DATA_TYPE, 1, 8, com);
+
+ com[0].s[0] = in[2].v - 4.25f * in[4].v + in[6].v;
+ com[0].s[1] = in[1].v - 4.25f * in[3].v + in[5].v;
+ com[0].s[2] = 0.5f * in[1].v - 2.5f * in[3].v + 2.0f * in[5].v;
+ com[0].s[3] = 0.25f * in[2].v - 1.25f * in[4].v + in[6].v;
+ com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
+ com[0].s[5] = 2.0f * in[1].v - 2.5f * in[3].v + 0.5f * in[5].v;
+ out[0].s[0] = in[0].v - 5.25f * in[2].v + 5.25f * in[4].v - in[6].v;
+ out[1].s[0] = com[0].s[0] + com[0].s[1];
+ out[2].s[0] = com[0].s[0] - com[0].s[1];
+ out[3].s[0] = com[0].s[3] + com[0].s[2];
+ out[4].s[0] = com[0].s[3] - com[0].s[2];
+ out[5].s[0] = com[0].s[4] + com[0].s[5];
+ out[6].s[0] = com[0].s[4] - com[0].s[5];
+ out[7].s[0] = -in[1].v + 5.25f * in[3].v - 5.25f * in[5].v + in[7].v;
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+ dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = in[2 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+ com[1].s[i] = in[1 * 8 + i].s[0] - (DATA_TYPE)4.25f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0]; // x
+ com[2].s[i] = (DATA_TYPE)0.25f * in[2 * 8 + i].s[0] - (DATA_TYPE)1.25f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0]; // x
+ com[3].s[i] = (DATA_TYPE)0.5f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0]; // x
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)2.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)2.5f * in[3 * 8 + i].s[0] + (DATA_TYPE)0.5f * in[5 * 8 + i].s[0];
+ com[6].s[i] = in[0 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[2 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[4 * 8 + i].s[0] - in[6 * 8 + i].s[0];
+ com[7].s[i] = -in[1 * 8 + i].s[0] + (DATA_TYPE)5.25f * in[3 * 8 + i].s[0] - (DATA_TYPE)5.25f * in[5 * 8 + i].s[0] + in[7 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v + com[1].v;
+ tmp[2].v = com[0].v - com[1].v;
+ tmp[3].v = com[2].v + com[3].v;
+ tmp[4].v = com[2].v - com[3].v;
+ tmp[5].v = com[4].v + com[5].v;
+ tmp[6].v = com[4].v - com[5].v;
+ tmp[7].v = com[7].v;
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = tmp[i].s[2] - 4.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = tmp[i].s[1] - 4.25f * tmp[i].s[3] + tmp[i].s[5];
+ com[0].s[2] = 0.5f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
+ com[0].s[3] = 0.25f * tmp[i].s[2] - 1.25f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = 2.0f * tmp[i].s[1] - 2.5f * tmp[i].s[3] + 0.5f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = tmp[i].s[0] - 5.25f * tmp[i].s[2] + 5.25f * tmp[i].s[4] - tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[3] + com[0].s[2];
+ out[i * 8 + 4].s[0] = com[0].s[3] - com[0].s[2];
+ out[i * 8 + 5].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 7].s[0] = -tmp[i].s[1] + 5.25f * tmp[i].s[3] - 5.25f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+ dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // !defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x7/7x1/1x7 and the output tile is 2x2/7x1/1x7 when the data layout is NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x2_7x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ const int cout = GET_SPATIAL_IDX(0, 1, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // NUM_TILES_X x NUM_TILES_Y
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ // All the tensor dimensions are passed at compile time.
+ // In case of dynamic tensor support, the following dimensions should be passed as function argument.
+#define _ISRC_WIDTH SRC_WIDTH
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+#define _INUM_TILES_Y NUM_TILES_Y
+
+ int x = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+ x -= PAD_LEFT;
+ y -= PAD_TOP;
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 8, 1, in);
+ TILE(DATA_TYPE, 8, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+#if defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 1, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+ T_LOAD_NHWC(DATA_TYPE, 8, 1, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL)
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v *= (DATA_TYPE) - 36.0f;
+ })
+
+ TILE(DATA_TYPE, 1, 8, com) = { { { 0 } } };
+
+ com[0].s[0] = 36.0f * in[2].v - 13.0f * in[4].v + in[6].v;
+ com[0].s[1] = 36.0f * in[1].v - 13.0f * in[3].v + 1.0f * in[5].v;
+ com[0].s[2] = 9.0f * in[2].v - 10.0f * in[4].v + in[6].v;
+ com[0].s[3] = 18.0f * in[1].v - 20.0f * in[3].v + 2.0f * in[5].v;
+ com[0].s[4] = 4.0f * in[2].v - 5.0f * in[4].v + in[6].v;
+ com[0].s[5] = 12.0f * in[1].v - 15.0f * in[3].v + 3.0f * in[5].v;
+ out[0].s[0] = -36.0f * in[0].v + 49.0f * in[2].v + -14.0f * in[4].v + in[6].v;
+ out[1].s[0] = com[0].s[0] - com[0].s[1];
+ out[2].s[0] = com[0].s[0] + com[0].s[1];
+ out[3].s[0] = com[0].s[2] - com[0].s[3];
+ out[4].s[0] = com[0].s[2] + com[0].s[3];
+ out[5].s[0] = com[0].s[4] - com[0].s[5];
+ out[6].s[0] = com[0].s[4] + com[0].s[5];
+ out[7].s[0] = -36.0f * in[1].v + 0.0f * in[2].v + 49.0f * in[3].v - 14.0f * in[5].v + in[7].v;
+
+ TILE(uint, 8, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+ dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 8;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 8, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, 1, in);
+ TILE(DATA_TYPE, 64, 1, out);
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the tile from a NHWC tensor
+ T_LOAD_NHWC(DATA_TYPE, 8, 8, 1, BUFFER, src, bout, y, x, cout, _ISRC_WIDTH, _ISRC_HEIGHT, src_stride_y, in);
+
+ TILE(DATA_TYPE, 8, 8, com);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[i] = (DATA_TYPE)36.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[1].s[i] = (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)13.0f * in[3 * 8 + i].s[0] + in[5 * 8 + i].s[0];
+ com[2].s[i] = (DATA_TYPE)9.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)10.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[3].s[i] = (DATA_TYPE)18.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)20.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)2.0f * in[5 * 8 + i].s[0];
+ com[4].s[i] = (DATA_TYPE)4.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)5.0f * in[4 * 8 + i].s[0] + in[6 * 8 + i].s[0];
+ com[5].s[i] = (DATA_TYPE)12.0f * in[1 * 8 + i].s[0] - (DATA_TYPE)15.0f * in[3 * 8 + i].s[0] + (DATA_TYPE)3.0f * in[5 * 8 + i].s[0];
+ com[6].s[i] = (DATA_TYPE)49.0f * in[2 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[0 * 8 + i].s[0] + in[6 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[4 * 8 + i].s[0];
+ com[7].s[i] = (DATA_TYPE)49.0f * in[3 * 8 + i].s[0] - (DATA_TYPE)36.0f * in[1 * 8 + i].s[0] + in[7 * 8 + i].s[0] - (DATA_TYPE)14.0f * in[5 * 8 + i].s[0];
+ })
+
+ TILE(DATA_TYPE, 8, 8, tmp);
+ tmp[0].v = com[6].v;
+ tmp[1].v = com[0].v - com[1].v;
+ tmp[2].v = com[0].v + com[1].v;
+ tmp[3].v = com[2].v - com[3].v;
+ tmp[4].v = com[2].v + com[3].v;
+ tmp[5].v = com[4].v - com[5].v;
+ tmp[6].v = com[4].v + com[5].v;
+ tmp[7].v = com[7].v;
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ com[0].s[0] = 36.0f * tmp[i].s[2] - 13.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[1] = 36.0f * tmp[i].s[1] - 13.0f * tmp[i].s[3] + 1.0f * tmp[i].s[5];
+ com[0].s[2] = 9.0f * tmp[i].s[2] - 10.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[3] = 18.0f * tmp[i].s[1] - 20.0f * tmp[i].s[3] + 2.0f * tmp[i].s[5];
+ com[0].s[4] = 4.0f * tmp[i].s[2] - 5.0f * tmp[i].s[4] + tmp[i].s[6];
+ com[0].s[5] = 12.0f * tmp[i].s[1] - 15.0f * tmp[i].s[3] + 3.0f * tmp[i].s[5];
+ out[i * 8 + 0].s[0] = -36.0f * tmp[i].s[0] + 49.0f * tmp[i].s[2] + -14.0f * tmp[i].s[4] + tmp[i].s[6];
+ out[i * 8 + 1].s[0] = com[0].s[0] - com[0].s[1];
+ out[i * 8 + 2].s[0] = com[0].s[0] + com[0].s[1];
+ out[i * 8 + 3].s[0] = com[0].s[2] - com[0].s[3];
+ out[i * 8 + 4].s[0] = com[0].s[2] + com[0].s[3];
+ out[i * 8 + 5].s[0] = com[0].s[4] - com[0].s[5];
+ out[i * 8 + 6].s[0] = com[0].s[4] + com[0].s[5];
+ out[i * 8 + 7].s[0] = -36.0f * tmp[i].s[1] + 0.0f * tmp[i].s[2] + 49.0f * tmp[i].s[3] - 14.0f * tmp[i].s[5] + tmp[i].s[7];
+ })
+
+ TILE(uint, 64, 1, dst_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ dst_indirect_y[i].v = mout + i *_INUM_TILES_X *_INUM_TILES_Y;
+ dst_indirect_y[i].v += bout *_INUM_TILES_X *_INUM_TILES_Y * 64;
+ })
+
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 64, 1, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#endif // defined(WINOGRAD_INPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_INPUT_TRANSFORM_VERTICAL)
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 3x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_3x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 5x1 and the output tile is 4x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_4x1_5x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 7x1 and the output tile is 2x1 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_2x1_7x1_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x3 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x3_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ winograd_input_transform_4x4_3x3_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x5 and the output tile is 1x4 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x4_1x5_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ winograd_input_transform_4x4_5x5_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes);
+}
+
+//! @cond Doxygen_Suppress
+/** This OpenCL kernel computes the input transform when the kernel size is 1x7 and the output tile is 1x2 for data layout NHWC
+ *
+ * @note Data layout supported: NHWC
+ * @note Data type supported: F32/F16
+ * @note The data type must be passed at compile time using -DDATA_TYPE (e.g. -DDATA_TYPE=half)
+ * @note The number of tiles in the X and Y axes must be passed at compile time using -DNUM_TILES_X and -DNUM_TILES_Y (i.e.-DNUM_TILES_X=5, -DNUM_TILES_Y=3).
+ * @note The convolution padding (left and top) must be passed at compile time using -DPAD_LEFT and -DPAD_TOP (e.g. -DPAD_LEFT=2, -DPAD_TOP=2)
+ * @note The spatial dimensions of the source tensor must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT (e.g. -DSRC_WIDTH=96, -DSRC_HEIGHT=64)
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note If this kernel is used to perform Winograd input transform 3x1, -DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd input transform 1x3, -DWINOGRAD_INPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ *
+ * @param[in] src_ptr Pointer to the source image. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_ptr Pointer to the destination tensor. Supported data types: as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the destination tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+//! @endcond
+__kernel void winograd_input_transform_1x2_1x7_stepz1_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER))
+{
+ winograd_input_transform_2x2_7x7_stepz1_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes);
+}
+#endif // defined(NHWC) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(NUM_TILES_X) && defined(NUM_TILES_Y)
+#endif // defined(NUM_TILES_X) && defined(PAD_LEFT) && defined(PAD_TOP) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
diff --git a/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
new file mode 100644
index 0000000000..0fcd04e713
--- /dev/null
+++ b/src/core/CL/cl_kernels/nhwc/winograd_output_transform.cl
@@ -0,0 +1,1030 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "activation_float_helpers.h"
+#include "helpers.h"
+#include "tile_helpers.h"
+
+#if defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x2/2x1 or 1x2, the filter size 7x7/7x1 or 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 7x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x7, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x2_7x7_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+#define _ISRC_HEIGHT SRC_HEIGHT
+#define _IDST_WIDTH DST_WIDTH
+#define _IDST_HEIGHT DST_HEIGHT
+#define _INUM_TILES_X NUM_TILES_X
+
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+ int x_out = (mout % _INUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / _INUM_TILES_X) * OUTPUT_TILE_H;
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 2, N0, out);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 8 channels to compose the 8x1 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out0 and out01
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v + in[5].v + in[6].v;
+ out[1].v = -in[1].v + in[2].v - 2.f * in[3].v + 2.0f * in[4].v - 3.0f * in[5].v + 3.0f * in[6].v + in[7].v;
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ADD_BROADCAST_X(DATA_TYPE, 2, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 2, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 2, 1, dst_indirect_y);
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c * (int)(_IDST_WIDTH);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out * (int)(_IDST_WIDTH);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 2, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 16, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ // Calculate the indirect Y for the source tensor
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i *_ISRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(_ISRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 64 channels to compose the 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[i * 2].v = in[0 + i].v + in[8 + i].v + in[16 + i].v + in[24 + i].v + in[32 + i].v + in[40 + i].v + in[48 + i].v;
+ tmp[i * 2 + 1].v = -in[8 + i].v + in[16 + i].v - 2 * in[24 + i].v + 2 * in[32 + i].v + -3 * in[40 + i].v + 3 * in[48 + i].v + in[56 + i].v;
+ })
+
+ // Compute the 2x2 output tile
+ LOOP_UNROLLING(int, i, 0, 1, 2,
+ {
+ out[i * 2].v = tmp[0 + i].v + tmp[2 + i].v + tmp[4 + i].v + tmp[6 + i].v + tmp[8 + i].v + tmp[10 + i].v + tmp[12 + i].v;
+ out[i * 2 + 1].v = -tmp[2 + i].v + tmp[4 + i].v - 2 * tmp[6 + i].v + 2 * tmp[8 + i].v - 3 * tmp[10 + i].v + 3 * tmp[12 + i].v + tmp[14 + i].v;
+ })
+
+#if defined(HAS_BIAS)
+ // Add bias
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // defined(HAS_BIAS)
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 2,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 2,
+ {
+ int x_c = min(x_out + xk, ((int)_IDST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)_IDST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 2].v = x_c + y_c *_IDST_WIDTH;
+ dst_indirect_y[xk + yk * 2].v += bout * (int)(_IDST_WIDTH * _IDST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // !defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) && !defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4, 4x1 or 1x4, the filter size 3x3, 3x1 or 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 3x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x3, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ * @param[in] dst_size Size of the destination tensor, minus the last padding
+ */
+__kernel void winograd_output_transform_4x4_3x3_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ TILE(DATA_TYPE, 6, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(uint, 6, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 6);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 6, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // Compute out00, out01, out02 and out03
+ out[0].v = in[0].v + in[1].v + in[2].v + in[3].v + in[4].v;
+ out[1].v = in[1].v - in[2].v + 2.0f * in[3].v - 2.0f * in[4].v;
+ out[2].v = in[1].v + in[2].v + 4.0f * in[3].v + 4.0f * in[4].v;
+ out[3].v = in[1].v - in[2].v + 8.0f * in[3].v - 8.0f * in[4].v + in[5].v;
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 36, N0, in);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 36, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 36);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 36,
+ {
+ in[i].v = 0;
+ })
+
+ // Load the values across the 36 channels to compose the 6x6 or 6x1 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 36, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ LOOP_UNROLLING(int, i, 0, 1, 6,
+ {
+ tmp[0].v = in[6 + i].v + in[12 + i].v;
+ tmp[1].v = in[6 + i].v - in[12 + i].v;
+ tmp[2].v = in[18 + i].v + in[24 + i].v;
+ tmp[3].v = in[18 + i].v - in[24 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ in[i].v = in[i].v + tmp[0].v + tmp[2].v;
+ in[6 + i].v = tmp[3].v + tmp[1].v;
+ in[12 + i].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ in[18 + i].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[30 + i].v;
+ })
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[6 * i + 1].v + in[6 * i + 2].v;
+ tmp[1].v = in[6 * i + 1].v - in[6 * i + 2].v;
+ tmp[2].v = in[6 * i + 3].v + in[6 * i + 4].v;
+ tmp[3].v = in[6 * i + 3].v - in[6 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ out[4 * i + 0].v = in[6 * i + 0].v + tmp[0].v + tmp[2].v;
+ out[4 * i + 1].v = tmp[3].v + tmp[1].v;
+ out[4 * i + 2].v = fma(tmp[2].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[0].v);
+ out[4 * i + 3].v = fma(tmp[3].v, (VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[1].v) + in[6 * i + 5].v;
+ })
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x4/4x1 or 1x4, the filter size 5x5/5x1 or 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The height of the input tensor must be passed at compile time using -DSRC_HEIGHT: e.g. -DSRC_HEIGHT=32
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note If this kernel is used to perform Winograd output transform 5x1, -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note If this kernel is used to perform Winograd output transform 1x5, -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ * @note The number of output elements processed along the X direction must be passed at compile time using -DN0 e.g. -DN0=1
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x4_5x5_nhwc(
+ TENSOR4D(src, BUFFER),
+ TENSOR4D(dst, BUFFER),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ const int cout = GET_SPATIAL_IDX(0, N0, 0); // OFM
+ const int mout = GET_SPATIAL_IDX(1, 1, 0); // WINOGRAD OUTPUT TILES
+ const int bout = GET_SPATIAL_IDX(2, 1, 0); // BATCH SIZE IDX
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ TILE(DATA_TYPE, 8, N0, in);
+ TILE(DATA_TYPE, 4, N0, out);
+ TILE(DATA_TYPE, 4, N0, tmp);
+ TILE(uint, 8, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 8);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" contains 1x8 or 8x1 tile here
+ T_LOAD_INDIRECT(DATA_TYPE, 8, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in, and in this degenerate case out consists of 1 column/row
+ tmp[0].v = in[1].v - in[2].v;
+ tmp[1].v = 2.0f * (in[3].v - in[4].v);
+ tmp[2].v = 2.0f * (in[5].v + in[6].v);
+ tmp[3].v = in[3].v + in[4].v;
+ out[0].v = in[0].v + in[1].v + in[2].v + tmp[3].v + 4.0f * tmp[2].v;
+ out[1].v = tmp[0].v + tmp[1].v + 4.0f * (in[5].v - in[6].v);
+ out[2].v = in[1].v + in[2].v + 4.0f * tmp[3].v + tmp[2].v;
+ out[3].v = tmp[0].v + 4.0f * tmp[1].v + in[5].v - in[6].v + in[7].v;
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ADD_BROADCAST_X(DATA_TYPE, 4, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 4, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 4, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[yk].v = x_out + y_c *DST_WIDTH;
+ dst_indirect_y[yk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ dst_indirect_y[xk].v = x_c + y_out *DST_WIDTH;
+ dst_indirect_y[xk].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 4, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+
+#else // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+ // Calculate the indirect Y for the source tensor
+ TILE(DATA_TYPE, 64, N0, in);
+ TILE(DATA_TYPE, 6, N0, tmp);
+ TILE(uint, 64, 1, src_indirect_y);
+
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ src_indirect_y[i].v = mout + i *SRC_HEIGHT;
+ src_indirect_y[i].v += bout * (int)(SRC_HEIGHT * 64);
+ })
+
+ // Initialize the input tile
+ LOOP_UNROLLING(int, i, 0, 1, 64,
+ {
+ in[i].v = 0;
+ })
+
+ // "in" here is 8x8 tile
+ T_LOAD_INDIRECT(DATA_TYPE, 64, N0, BUFFER, src, cout, src_stride_y, src_indirect_y, in);
+
+ // A^T * in
+ LOOP_UNROLLING(int, i, 0, 1, 8,
+ {
+ tmp[0].v = in[8 + i].v + in[16 + i].v;
+ tmp[1].v = in[8 + i].v - in[16 + i].v;
+ tmp[2].v = in[24 + i].v + in[32 + i].v;
+ tmp[3].v = in[24 + i].v - in[32 + i].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[40 + i].v + in[48 + i].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[40 + i].v - in[48 + i].v;
+
+ // 4x8 matrix as a result
+ in[i].v = in[i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ in[8 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ in[16 + i].v = tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[4].v);
+ in[24 + i].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[5].v) + in[56 + i].v;
+ })
+
+ // Compute the output tile
+ TILE(DATA_TYPE, 16, N0, out);
+
+ // in * A, with in = A^T * in as above
+ LOOP_UNROLLING(int, i, 0, 1, 4,
+ {
+ tmp[0].v = in[8 * i + 1].v + in[8 * i + 2].v;
+ tmp[1].v = in[8 * i + 1].v - in[8 * i + 2].v;
+ tmp[2].v = in[8 * i + 3].v + in[8 * i + 4].v;
+ tmp[3].v = in[8 * i + 3].v - in[8 * i + 4].v;
+ tmp[3].v = tmp[3].v + tmp[3].v;
+ tmp[4].v = in[8 * i + 5].v + in[8 * i + 6].v;
+ tmp[4].v = tmp[4].v + tmp[4].v;
+ tmp[5].v = in[8 * i + 5].v - in[8 * i + 6].v;
+
+ // 4x4 tile
+ out[4 * i].v = in[8 * i].v + tmp[0].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[4].v, tmp[2].v);
+ out[4 * i + 1].v = tmp[1].v + fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[5].v, tmp[3].v);
+ out[4 * i + 2].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[2].v, tmp[0].v) + tmp[4].v;
+ out[4 * i + 3].v = fma((VEC_DATA_TYPE(DATA_TYPE, N0))4.0f, tmp[3].v, tmp[1].v) + tmp[5].v + in[8 * i + 7].v;
+ })
+
+#if defined(HAS_BIAS)
+ TILE(DATA_TYPE, 1, N0, b);
+
+ T_LOAD(DATA_TYPE, 1, N0, BUFFER, bias, cout, 0, 1, 0, b);
+
+ // c = c + bias[broadcasted]
+ T_ADD_BROADCAST_X(DATA_TYPE, 16, N0, out, b, out);
+#endif // HAS_BIAS
+
+ int x_out = (mout % NUM_TILES_X) * OUTPUT_TILE_W;
+ int y_out = (mout / NUM_TILES_X) * OUTPUT_TILE_H;
+
+ T_ACTIVATION(DATA_TYPE, 16, N0, ACTIVATION_TYPE, A_VAL, B_VAL, out, out);
+
+ TILE(uint, 16, 1, dst_indirect_y);
+
+ // Calculate the destination indirect Y
+ LOOP_UNROLLING(int, yk, 0, 1, 4,
+ {
+ LOOP_UNROLLING(int, xk, 0, 1, 4,
+ {
+ int x_c = min(x_out + xk, ((int)DST_WIDTH - 1));
+ int y_c = min(y_out + yk, ((int)DST_HEIGHT - 1));
+ dst_indirect_y[xk + yk * 4].v = x_c + y_c *DST_WIDTH;
+ dst_indirect_y[xk + yk * 4].v += bout * (int)(DST_WIDTH * DST_HEIGHT);
+ })
+ })
+
+ // Store the tile in reverse order so the invalid values are overwritten with the valid ones
+ T_STORE_INDIRECT_WIDTH_SELECT(DATA_TYPE, 16, N0, 0, BUFFER, dst, cout, dst_stride_y, false, out, dst_indirect_y);
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL) || defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 2x1, the filter size 7x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=2
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_2x1_7x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 3x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_3x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 4x1, the filter size 5x1 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=4
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=1
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_4x1_5x1_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL)
+
+#if defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#if defined(VEC_SIZE) && VEC_SIZE == 2
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x2, the filter size 1x7 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=2
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x2_1x7_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_2x2_7x7_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 2
+
+#if defined(VEC_SIZE) && VEC_SIZE == 4
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x3 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x3_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_4x4_3x3_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
+
+/** This OpenCL kernel performs Winograd output transform when the output tile is 1x4, the filter size 1x5 and the data layout is NHWC
+ *
+ * @note The number of tiles along the X direction must be passed at compile time using -DNUM_TILES_X: e.g. -DNUM_TILES_X=16
+ * @note The width of the output tile must be passed at compile time using -DOUTPUT_TILE_W: e.g. -DOUTPUT_TILE_W=1
+ * @note The height of the output tile must be passed at compile time using -DOUTPUT_TILE_H: e.g. -DOUTPUT_TILE_H=4
+ * @note The width of the output tensor must be passed at compile time using -DDST_WIDTH: e.g. -DDST_WIDTH=24
+ * @note The height of the output tensor must be passed at compile time using -DDST_HEIGHT: e.g. -DDST_HEIGHT=32
+ * @note -DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL has to be passed at compile time
+ * @note The data type must be passed at compile time using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types: float/half.
+ *
+ * @param[in] src_ptr Pointer to the source tensor. Supported data types: F32/F16
+ * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] src_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] src_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] src_step_z src_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] src_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] src_step_w src_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor
+ * @param[out] dst_ptr Pointer to the destination tensor. Supported data types: same as @p src_ptr
+ * @param[in] dst_stride_x Stride of the destination tensor in X dimension (in bytes)
+ * @param[in] dst_step_x dst_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] dst_stride_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] dst_step_y dst_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] dst_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] dst_step_z dst_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] dst_stride_w Stride of the source tensor in W dimension (in bytes)
+ * @param[in] dst_step_w dst_stride_w * number of elements along W processed per workitem(in bytes)
+ * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor
+ */
+__kernel void winograd_output_transform_1x4_1x5_nhwc(
+ TENSOR4D_DECLARATION(src),
+ TENSOR4D_DECLARATION(dst),
+#if defined(HAS_BIAS)
+ VECTOR_DECLARATION(bias),
+#endif // defined(HAS_BIAS)
+ int dst_size)
+{
+ winograd_output_transform_4x4_5x5_nhwc(src_ptr,
+ src_stride_x,
+ src_step_x,
+ src_stride_y,
+ src_step_y,
+ src_stride_z,
+ src_step_z,
+ src_stride_w,
+ src_step_w,
+ src_offset_first_element_in_bytes,
+ dst_ptr,
+ dst_stride_x,
+ dst_step_x,
+ dst_stride_y,
+ dst_step_y,
+ dst_stride_z,
+ dst_step_z,
+ dst_stride_w,
+ dst_step_w,
+ dst_offset_first_element_in_bytes,
+#if defined(HAS_BIAS)
+ bias_ptr,
+ bias_stride_x,
+ bias_step_x,
+ bias_offset_first_element_in_bytes,
+#endif // defined(HAS_BIAS)
+ dst_size);
+}
+#endif // defined(VEC_SIZE) && VEC_SIZE == 4
+#endif // defined(WINOGRAD_OUTPUT_TRANSFORM_VERTICAL)
+#endif // defined(NUM_TILES_X) && defined(OUTPUT_TILE_W) && defined(OUTPUT_TILE_H) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/pooling_layer.cl b/src/core/CL/cl_kernels/pooling_layer.cl
deleted file mode 100644
index d63a2e51e8..0000000000
--- a/src/core/CL/cl_kernels/pooling_layer.cl
+++ /dev/null
@@ -1,981 +0,0 @@
-/*
- * Copyright (c) 2017-2021 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-#include "repeat.h"
-#include "tile_helpers.h"
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#define POOL_OP(x, y) ((x) + (y))
-#else /* defined(POOL_AVG) || defined(POOL_L2) */
-#define POOL_OP(x, y) (fmax((x), (y)))
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
-#define POW2_OP(x, vec_size) ((x) * (x))
-#else /* defined(POOL_L2) */
-#define POW2_OP(x, vec_size) (x)
-#endif /* defined(POOL_L2) */
-
-#define DIV_OP(x, y) (x * (1.f / y))
-#define SQRT_OP(x) sqrt((x))
-
-#if STRIDE_X == 1
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE1(res, input, output)
-#elif STRIDE_X == 2 /* STRIDE_X == 1 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE2(res, input, output)
-#elif STRIDE_X == 3 /* STRIDE_X not equals 1 or 2 */
-#define POOLING3x3(res, input, output) POOLING3x3_STRIDE3(res, input, output)
-#endif /* STRIDE_X == 3 */
-
-#if defined(FP_MIXED_PRECISION)
-#define CONVERT_TO_ACC_DATA_TYPE(x, n) CONVERT(x, VEC_DATA_TYPE(ACC_DATA_TYPE, n))
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) \
- CONVERT_TO_ACC_DATA_TYPE(vload##n(offset, ptr), n)
-#else /* defined(FP_MIXED_PRECISION) */
-#define VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(n, offset, ptr) vload##n(offset, ptr)
-#endif /* defined(FP_MIXED_PRECISION) */
-
-#define POOLING3x3_STRIDE1(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 4); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 4); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2) \
- data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 4); \
- data00 = POW2_OP(data00, 4); \
- data01 = POW2_OP(data01, 2); \
- data10 = POW2_OP(data10, 4); \
- data11 = POW2_OP(data11, 2); \
- data20 = POW2_OP(data20, 4); \
- data21 = POW2_OP(data21, 2); \
- \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data01.s0, data00.s3, data01.s01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data11.s0, data10.s3, data11.s01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01212323); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data21.s0, data20.s3, data21.s01); \
- \
- values00 = POOL_OP(values00, values10); \
- values01 = POOL_OP(values01, values11); \
- values00 = POOL_OP(values00, values20); \
- values01 = POOL_OP(values01, values21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
- })
-
-#define POOLING3x3_STRIDE2(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- ACC_DATA_TYPE data01 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- ACC_DATA_TYPE data11 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- ACC_DATA_TYPE data21 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8)); \
- data00 = POW2_OP(data00, 8); \
- data01 = POW2_OP(data01, 1); \
- data10 = POW2_OP(data10, 8); \
- data11 = POW2_OP(data11, 1); \
- data20 = POW2_OP(data20, 8); \
- data21 = POW2_OP(data21, 1); \
- \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values00 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data00.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values01 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s667, data01); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values10 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data10.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values11 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data10.s667, data11); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- values20 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 8))(data20.s01223445); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- values21 = (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data20.s667, data21); \
- \
- values00 = POOL_OP(values00, values10); \
- values01 = POOL_OP(values01, values11); \
- values00 = POOL_OP(values00, values20); \
- values01 = POOL_OP(values01, values21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s036, values01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s147, values01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(values00.s25, values01.s03)); \
- })
-
-#define POOLING3x3_STRIDE3(res, input, output) \
- ({ \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data00 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data01 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0) + 8); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data10 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data11 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0) + 8); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8) \
- data20 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0)); \
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4) \
- data21 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(4, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0) + 8); \
- data00 = POW2_OP(data00, 8); \
- data01 = POW2_OP(data01, 4); \
- data10 = POW2_OP(data10, 8); \
- data11 = POW2_OP(data11, 4); \
- data20 = POW2_OP(data20, 8); \
- data21 = POW2_OP(data21, 4); \
- \
- data00 = POOL_OP(data00, data10); \
- data01 = POOL_OP(data01, data11); \
- data00 = POOL_OP(data00, data20); \
- data01 = POOL_OP(data01, data21); \
- \
- res = POOL_OP((VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s036, data01.s1), (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s147, data01.s2)); \
- res = POOL_OP(res, (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(data00.s25, data01.s03)); \
- })
-
-ACC_DATA_TYPE calculate_avg_scale(const int pool_size_x, const int pool_size_y, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int start_x = get_global_id(0) * stride_x - pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int end_x = min(start_x + pool_size_x, upper_bound_w);
- const int end_y = min(start_y + pool_size_y, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max(0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return ((end_y - start_y) * (end_x - start_x));
-}
-
-/** Performs a pooling function of pool size equal to 2.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_2(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(2, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 2);
- data1 = POW2_OP(data1, 2);
-#endif /* defined(POOL_L2) */
-
- // Perform calculations
- data0 = POOL_OP(data0, data1);
- ACC_DATA_TYPE res = POOL_OP(data0.s0, data0.s1);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average or l2 pooling
- res = DIV_OP(res, calculate_avg_scale(2, 2, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-/** Performs a pooling function of pool size equal to 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_3(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- // Load data
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data1 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 1, 0));
- VEC_DATA_TYPE(ACC_DATA_TYPE, 3)
- data2 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(3, 0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 2, 0));
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 = POW2_OP(data0, 3);
- data1 = POW2_OP(data1, 3);
- data2 = POW2_OP(data2, 3);
-#endif /* defined(POOL_L2) */
-
- // Perform calculations
- data0 = POOL_OP(data0, data1);
- data0 = POOL_OP(data0, data2);
- ACC_DATA_TYPE res = POOL_OP(POOL_OP(data0.s0, data0.s1), data0.s2);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res = DIV_OP(res, calculate_avg_scale(3, 3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-
-#if defined(POOLING3x3)
-
-#define CONVERT_OP(data_type) convert_##data_type##4
-#define CONVERT_VECTOR4(data_type) CONVERT_OP(data_type)
-
-VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
-calculate_avg_scale4(const int pool_size, const int upper_bound_w, const int upper_bound_h,
- const int pad_x, const int pad_y, const int stride_x, const int stride_y)
-{
- int4 start_x = ((int4)get_global_id(0) * 4 + (int4)(0, 1, 2, 3)) * (int4)stride_x - (int4)pad_x;
- int start_y = get_global_id(1) * stride_y - pad_y;
- const int4 end_x = min(start_x + (int4)pool_size, (int4)upper_bound_w);
- const int end_y = min(start_y + pool_size, upper_bound_h);
-#if defined(EXCLUDE_PADDING)
- start_x = max((int4)0, start_x);
- start_y = max(0, start_y);
-#endif /* defined(EXCLUDE_PADDING) */
- return (VEC_DATA_TYPE(ACC_DATA_TYPE, 4))(1.f) / CONVERT_VECTOR4(ACC_DATA_TYPE)(((int4)(end_y - start_y)) * (end_x - start_x));
-}
-
-/** Performs an optimized pooling function of pool size equal to 3 when the stride_x is less equal than 3
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG or -DPOOL_L2 must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_optimized_3(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- res;
-
- // Perform pooling 3x3 for 4 output elements
- POOLING3x3(res, input, output);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res *= calculate_avg_scale4(3, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y);
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- vstore4(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 4)), 0, (__global DATA_TYPE *)output.ptr);
-}
-#endif // defined(POOLING3x3)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-/** Performs a pooling function of pool size equal to N (NCHW)
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=float. Supported data types are F16/F32;
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note In case of average pooling the following information must be passed at compile time:
- * -DPOOL_AVG must be provided otherwise max pooling will be performed.
- * -DMAX_WIDTH and -DMAX_HEIGHT which are the maximum accessible indeces in x and y dimensions (width + pad)
- * -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * -DPAD_X and -DPAD_Y which are the pooling paddings in x and y dimension
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16/F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nchw(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- vdata = INITIAL_VALUE;
- ACC_DATA_TYPE sdata = INITIAL_VALUE;
-
- // Load data
- for(int y = 0; y < POOL_SIZE_Y; y++)
- {
- int x = 0;
- for(; x <= ((int)POOL_SIZE_X - 8); x += 8)
- {
- VEC_DATA_TYPE(ACC_DATA_TYPE, 8)
- data0 = VLOAD_AND_CONVERT_TO_ACC_DATA_TYPE(8, 0, (__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif /* defined(POOL_L2) */
- vdata = POOL_OP(vdata, data0);
- }
-
- // Leftover
- for(; x < (int)POOL_SIZE_X; ++x)
- {
- ACC_DATA_TYPE data0 = (ACC_DATA_TYPE)(*((__global DATA_TYPE *)tensor3D_offset(&input, x, y, 0)));
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif /* defined(POOL_L2) */
- sdata = POOL_OP(sdata, data0);
- }
- }
-
- // Reduce result
- VEC_DATA_TYPE(ACC_DATA_TYPE, 4)
- reduce4 = POOL_OP(vdata.s0123, vdata.s4567);
- VEC_DATA_TYPE(ACC_DATA_TYPE, 2)
- reduce2 = POOL_OP(reduce4.s01, reduce4.s23);
- ACC_DATA_TYPE res = POOL_OP(reduce2.s0, reduce2.s1);
- res = POOL_OP(res, sdata);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- // Divide by pool region in case of average pooling
- res = DIV_OP(res, calculate_avg_scale(POOL_SIZE_X, POOL_SIZE_Y, MAX_WIDTH, MAX_HEIGHT, PAD_X, PAD_Y, STRIDE_X, STRIDE_Y));
-#endif /* defined(POOL_AVG) || defined(POOL_L2) */
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res = SQRT_OP(res);
-#endif /* defined(POOL_L2) */
-
- // Store result
- *(__global DATA_TYPE *)output.ptr = (DATA_TYPE)res;
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-inline void offset_no_padding_nchw(const Tensor3D *input, uint *offset_top, uint *offset_bottom)
-{
- const int pad_horiz = PAD_TENSOR_LEFT + PAD_TENSOR_RIGHT;
- const int pad_vert = PAD_TENSOR_TOP + PAD_TENSOR_BOTTOM;
-
- const int x = get_global_id(0) * STRIDE_X;
- const int y = get_global_id(1) * STRIDE_Y;
- const int z = get_global_id(2);
-
- //x axis: width, y axis: height, z axis: component
- const uint padded_offset = input->offset_first_element_in_bytes
- + x * input->stride_x
- + y * input->stride_y
- + z * input->stride_z;
-
- const uint offset_base = padded_offset
- - y * pad_horiz * sizeof(DATA_TYPE) /* Horizontal padding for each row */
- - PAD_TENSOR_TOP * input->stride_y /* top padding */
- - z * MAX_HEIGHT * pad_horiz * sizeof(DATA_TYPE) - z * pad_vert * input->stride_y /* Z plane padding */
- - PAD_TENSOR_LEFT * sizeof(DATA_TYPE);
-
-#if defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT)
- *offset_top = (uint)((offset_base / sizeof(DATA_TYPE)) % (TENSOR_CHANNEL * TENSOR_WIDTH * TENSOR_HEIGHT));
-#else /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
- *offset_top = (uint)(offset_base / sizeof(DATA_TYPE));
-#endif /* defined(TENSOR_CHANNEL) && defined(TENSOR_WIDTH) && defined(TENSOR_HEIGHT) */
-
- *offset_bottom = *offset_top + input->stride_y / sizeof(DATA_TYPE) - pad_horiz;
-
- return;
-}
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
-/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp32(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- float2 data0 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 0, 0));
- float2 data1 = VLOAD(2)(0, (__global float *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- float data0_max = POOL_OP(data0.s0, data0.s1);
- float data1_max = POOL_OP(data1.s0, data1.s1);
- float res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global float *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-/** Performs a MAX pooling of pool size equal to 2, and record max value indices for NCHW.
- *
- * @note Datatype must be passed using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F16
- * @note Pool sizes must be passed using -DPOOL_SIZE_X and -DPOOL_SIZE_Y e.g. -DPOOL_SIZE_X=13;
- * @note Tensors width and height must be passed at compile time using -DMAX_WIDTH and -DMAX_HEIGHT
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Tensor padding values must be passed at compile time using PAD_TENSOR_LEFT, PAD_TENSOR_RIGHT, PAD_TENSOR_TOP and PAD_TENSOR_BOTTOM
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2_nchw_indices_fp16(
- TENSOR3D_DECLARATION(input),
- TENSOR3D_DECLARATION(output),
- TENSOR3D_DECLARATION(indices))
-{
- // Get pixels pointer
- Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input);
- Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output);
- Tensor3D indices = CONVERT_TO_TENSOR3D_STRUCT(indices);
-
- // Load data
- half2 data0 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 0, 0));
- half2 data1 = VLOAD(2)(0, (__global half *)tensor3D_offset(&input, 0, 1, 0));
-
- // Perform calculations
- half data0_max = POOL_OP(data0.s0, data0.s1);
- half data1_max = POOL_OP(data1.s0, data1.s1);
- half res = POOL_OP(data0_max, data1_max);
- // Store result
- *(__global half *)output.ptr = res;
-
-#if defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-
- uint offset_top = 0;
- uint offset_bottom = 0;
-
- offset_no_padding_nchw(&input, &offset_top, &offset_bottom);
-
- uint index0 = select(offset_top + 1, offset_top, isgreaterequal(data0.s0, data0.s1));
- uint index1 = select(offset_bottom + 1, offset_bottom, isgreaterequal(data1.s0, data1.s1));
- uint index = select(index1, index0, isgreaterequal(data0_max, data1_max));
-
- *(__global uint *)indices.ptr = index;
-
-#endif //defined(PAD_TENSOR_LEFT) && defined(PAD_TENSOR_RIGHT) && defined(PAD_TENSOR_TOP) && defined(PAD_TENSOR_BOTTOM)
-}
-
-#if defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE)
-
-#if defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-/** Performs pooling layer of size equal to MxN. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Pool size must be passed at compile time using -DPOOL_SIZE_X and -DPOOL_SIZE_Y. e.g. -DPOOL_SIZE_X=4, -DPOOL_SIZE_Y=4
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- */
-__kernel void pooling_layer_MxN_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output))
-{
- // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
- // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
- int idx_out_c = GET_SPATIAL_IDX(0, VEC_SIZE, VEC_SIZE_LEFTOVER);
- int idx_out_w = GET_SPATIAL_IDX(1, 1, 0);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = GET_SPATIAL_IDX(2, 1, 0) % DST_HEIGHT;
- int idx_out_n = GET_SPATIAL_IDX(2, 1, 0) / DST_HEIGHT;
-#else //DST_BATCH_SIZE != 1
- int idx_out_h = GET_SPATIAL_IDX(2, 1, 0);
- int idx_out_n = 0;
-#endif // DST_BATCH_SIZE != 1
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
- output_stride_w;
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = INITIAL_VALUE;
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)POOL_SIZE_X, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)POOL_SIZE_Y, (int)SRC_HEIGHT - idx_in_h);
-
-#if defined(EXCLUDE_PADDING)
- int filter_size = (pool_y_e - pool_y_s) * (pool_x_e - pool_x_s);
-#else // defined(EXCLUDE_PADDING)
- int filter_size = POOL_SIZE_X * POOL_SIZE_Y;
-#endif // defined(EXCLUDE_PADDING)
-
-#if POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
- // Global pooling path
- for(int y = 0; y < POOL_SIZE_Y; ++y)
- {
-#pragma unroll 8
- for(int x = 0; x < POOL_SIZE_X; ++x)
- {
-#else // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
- for(int y = pool_y_s; y < pool_y_e; ++y)
- {
-#pragma unroll 8
- for(int x = pool_x_s; x < pool_x_e; ++x)
- {
-#endif // POOL_SIZE_X == SRC_WIDTH && POOL_SIZE_Y == SRC_HEIGHT && PAD_X == 0 && PAD_Y == 0
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- data0;
-#if defined(FP_MIXED_PRECISION)
- // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
- data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + (x + idx_in_w) * input_stride_y + (y + idx_in_h) * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
-#endif // defined(POOL_L2)
- res0 = POOL_OP(res0, data0);
- }
- }
-
-#if defined(POOL_AVG) || defined(POOL_L2)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
- // Store result
-#if defined(FP_MIXED_PRECISION)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else // defined(FP_MIXED_PRECISION)
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-}
-#endif // defined(POOL_SIZE_X) && defined(POOL_SIZE_Y)
-
-#define SELECT_TYPE SELECT_VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
-
-/** Performs pooling layer of size equal to 2. This OpenCL kernel can perform the following pooling types:
- * -# max, -DPOOL_MAX must be passed at compile time
- * -# max extracting the max index, -DPOOL_MAX and -DEXTRACT_MAX_INDEX must be passed at compile time
- * -# average, -DPOOL_AVG must be passed at compile time. If padding has to be expluded, -DEXCLUDE_PADDING should be passed at compile time
- * -# l2 normalisation, -DPOOL_L2 must be passed at compile time
- *
- * @note Datatype must be passed at compile type using -DDATA_TYPE e.g. -DDATA_TYPE=half. Supported data types are F32/F16
- * @note Accumulation data type must be passed at compile time using -DACC_DATA_TYPE e.g. -DACC_DATA_TYPE=float
- * @note If -DFP_MIXED_PRECISION is passed at compile time, the kernel will use F32 for the partial result
- * @note Input tensor width and height must be passed at compile time using -DSRC_WIDTH and -DSRC_HEIGHT
- * @note Output tensor height, channels and batch size must be passed at compile time using -DDST_HEIGHT, -DDST_CHANNELS and -DDST_BATCH_SIZE
- * @note Pool strides must be passed at compile time using -DSTRIDE_X and -DSTRIDE_Y which are the steps of the window along the x and y directions
- * @note Pool pads must be passed at compile time using -DPAD_X and -DPAD_Y
- * @note Vector size must be passed at compile time using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16
- * @note Leftover vector size must be passed at compile time using -DVEC_SIZE_LEFTOVER. e.g. -DVEC_SIZE_LEFTOVER=3. It is defined as the remainder between the input's first dimension and VEC_SIZE
- * @note The initial value for the pooling operation must be passed at compile time using -DINITIAL_VALUE e.g. -DINITIAL_VALUE=0
- *
- * @param[in] input_ptr Pointer to the source tensor. Supported data types: F32/F16
- * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes)
- * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes)
- * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes)
- * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] input_stride_w Stride of the source tensor in W dimension (in bytes)
- * @param[in] input_step_w input_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor
- * @param[out] output_ptr Pointer to the destination tensor. Supported data types: same as @p input_ptr
- * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes)
- * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes)
- * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] output_stride_z Stride of the destination tensor in Z dimension (in bytes)
- * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] output_stride_w Stride of the destination tensor in W dimension (in bytes)
- * @param[in] output_step_w output_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor
- * @param[in] indices_ptr (Optional) Pointer to the indices tensor. Supported data types: U32
- * @param[in] indices_stride_x (Optional) Stride of the indices tensor in X dimension (in bytes)
- * @param[in] indices_step_x (Optional) indices_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] indices_stride_y (Optional) Stride of the indices tensor in Y dimension (in bytes)
- * @param[in] indices_step_y (Optional) indices_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] indices_stride_z (Optional) Stride of the indices tensor in Z dimension (in bytes)
- * @param[in] indices_step_z (Optional) indices_stride_z * number of elements along Z processed per workitem(in bytes)
- * @param[in] indices_stride_w (Optional) Stride of the indices tensor in W dimension (in bytes)
- * @param[in] indices_step_w (Optional) indices_stride_w * number of elements along W processed per workitem(in bytes)
- * @param[in] indices_offset_first_element_in_bytes (Optional) The offset of the first element in the indices tensor
- */
-__kernel void pooling_layer_2x2_nhwc(
- TENSOR4D_DECLARATION(input),
- TENSOR4D_DECLARATION(output)
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
- ,
- TENSOR4D_DECLARATION(indices)
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-)
-{
- // Note: If C is not multiple of VEC_SIZE, we shift back of VEC_SIZE_LEFTOVER elements to compute the leftover elements for get_global_id(0) == 0
- // Note: If C is less than VEC_SIZE, VEC_SIZE should be SHRINKED to the closest smaller VEC_SIZE. This operation is performed on the host side
- int idx_out_c = max((int)(get_global_id(0) * VEC_SIZE - (VEC_SIZE - VEC_SIZE_LEFTOVER) % VEC_SIZE), 0);
- int idx_out_w = get_global_id(1);
-#if DST_BATCH_SIZE != 1
- // If batch size != 1, the batch size dimension is collapsed over the height dimension
- int idx_out_h = get_global_id(2) % DST_HEIGHT;
- int idx_out_n = get_global_id(2) / DST_HEIGHT;
-#else //SRC_BATCH_SIZE != 1
- int idx_out_h = get_global_id(2);
- int idx_out_n = 0;
-#endif // SRC_BATCH_SIZE != 1
-
- int idx_in_w = idx_out_w * STRIDE_X - PAD_X;
- int idx_in_h = idx_out_h * STRIDE_Y - PAD_Y;
-
- __global unsigned char *in_base_ptr = input_ptr + input_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_n * input_stride_w;
-
- __global unsigned char *out_base_ptr = output_ptr + output_offset_first_element_in_bytes + idx_out_c * sizeof(DATA_TYPE) + idx_out_w * output_stride_y + idx_out_h * output_stride_z + idx_out_n *
- output_stride_w;
-
- int pool_x_s = max((int)0, -idx_in_w);
- int pool_x_e = min((int)2, (int)SRC_WIDTH - idx_in_w);
- int pool_y_s = max((int)0, -idx_in_h);
- int pool_y_e = min((int)2, (int)SRC_HEIGHT - idx_in_h);
-
- int filter_size = (pool_x_e - pool_x_s) * (pool_y_e - pool_y_s);
-
- int x0 = pool_x_s + idx_in_w;
- int y0 = pool_y_s + idx_in_h;
- int x1 = pool_x_e - 1 + idx_in_w;
- int y1 = pool_y_e - 1 + idx_in_h;
-
- REPEAT_VAR_INIT_TO_CONST(4, VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE), data, 0);
-
-#if defined(FP_MIXED_PRECISION)
- // In case of FP_MIXED_PRECISION, ACC_DATA_TYPE is != DATA_TYPE
- data0 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data1 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data2 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
- data3 = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z)), VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE));
-#else // defined(FP_MIXED_PRECISION)
- data0 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y0 * input_stride_z));
- data1 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y0 * input_stride_z));
- data2 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x0 * input_stride_y + y1 * input_stride_z));
- data3 = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)(in_base_ptr + x1 * input_stride_y + y1 * input_stride_z));
-#endif // defined(FP_MIXED_PRECISION)
-
-#if !defined(POOL_MAX)
- if(filter_size != 4)
- {
- SELECT_TYPE cond_w_s = (SELECT_TYPE)idx_in_w < (SELECT_TYPE)0;
- SELECT_TYPE cond_w_e = (SELECT_TYPE)idx_in_w >= (SELECT_TYPE)(SRC_WIDTH - 1);
- SELECT_TYPE cond_h_s = (SELECT_TYPE)idx_in_h < (SELECT_TYPE)0;
- SELECT_TYPE cond_h_e = (SELECT_TYPE)idx_in_h >= (SELECT_TYPE)(SRC_HEIGHT - 1);
-
- // Make invalid the values loaded if the x or y coordinate was clamped (out-of-bound)
- data0 = select(data0, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_s));
- data1 = select(data1, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_s));
- data2 = select(data2, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_s | cond_h_e));
- data3 = select(data3, (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))INITIAL_VALUE, (SELECT_TYPE)(cond_w_e | cond_h_e));
- }
-#endif // !defined(POOL_MAX)
-
-#if defined(POOL_L2)
- // Raise to power of 2 for L2 Pooling
- data0 *= data0;
- data1 *= data1;
- data2 *= data2;
- data3 *= data3;
-#endif /* defined(POOL_L2) */
-
- VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE)
- res0 = data0;
- res0 = POOL_OP(res0, data1);
- res0 = POOL_OP(res0, data2);
- res0 = POOL_OP(res0, data3);
-
-#if defined(POOL_AVG) || defined(POOL_L2)
-#if defined(EXCLUDE_PADDING)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))filter_size;
-#else // !defined(EXCLUDE_PADDING)
- res0 /= (VEC_DATA_TYPE(ACC_DATA_TYPE, VEC_SIZE))4;
-#endif // defined(EXCLUDE_PADDING)
-#endif // defined(POOL_AVG) || defined(POOL_L2)
-
-#if defined(POOL_L2)
- // Take square root of the result in L2 pooling
- res0 = SQRT_OP(res0);
-#endif // defined(POOL_L2)
-
- // Store result
-#if defined(FP_MIXED_PRECISION)
- VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
- res_converted0 = CONVERT(res0, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE));
- STORE_VECTOR_SELECT(res_converted, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#else // defined(FP_MIXED_PRECISION)
- STORE_VECTOR_SELECT(res, DATA_TYPE, out_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, (VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0);
-#endif // defined(FP_MIXED_PRECISION)
-
-#if defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-
- // This part is used to return the index of the maximum value
- // Note: DST_CHANNELS and DST_BATCH_SIZE can be used for either the input and output tensor
-
- // note: Batch dimension does not contribute in the offset contribution
- VEC_DATA_TYPE(uint, VEC_SIZE)
- base_index = (uint)idx_out_c;
-
- base_index += VEC_OFFS(uint, VEC_SIZE);
-
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index0 = base_index + (uint)x0 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index1 = base_index + (uint)x1 * DST_CHANNELS + (uint)y0 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index2 = base_index + (uint)x0 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
- VEC_DATA_TYPE(uint, VEC_SIZE)
- index3 = base_index + (uint)x1 * DST_CHANNELS + (uint)y1 * (DST_CHANNELS * SRC_WIDTH);
-
- index0 = select(index1, index0, CONVERT(isgreaterequal(data0, data1), VEC_DATA_TYPE(int, VEC_SIZE)));
- index1 = select(index3, index2, CONVERT(isgreaterequal(data2, data3), VEC_DATA_TYPE(int, VEC_SIZE)));
- index0 = select(index1, index0, CONVERT(isgreaterequal(max(data0, data1), max(data2, data3)), VEC_DATA_TYPE(int, VEC_SIZE)));
-
- __global unsigned char *idx_base_ptr = indices_ptr + indices_offset_first_element_in_bytes + idx_out_c * sizeof(uint) + idx_out_w * indices_stride_y + idx_out_h * indices_stride_z + idx_out_n *
- indices_stride_w;
-
- // Store result
- STORE_VECTOR_SELECT(index, uint, idx_base_ptr, VEC_SIZE, VEC_SIZE_LEFTOVER, ((VEC_SIZE_LEFTOVER != 0) && get_global_id(0) == 0));
-#endif // defined(EXTRACT_MAX_INDEX) && defined(POOL_MAX)
-}
-#endif // defined(VEC_SIZE) && defined(VEC_SIZE_LEFTOVER) && defined(SRC_WIDTH) && defined(SRC_HEIGHT) && defined(DST_CHANNELS) && defined(DST_HEIGHT) && defined(DST_BATCH_SIZE) && defined(ACC_DATA_TYPE) \ No newline at end of file
diff --git a/src/core/CL/cl_kernels/sobel_filter.cl b/src/core/CL/cl_kernels/sobel_filter.cl
deleted file mode 100644
index 7983734fc4..0000000000
--- a/src/core/CL/cl_kernels/sobel_filter.cl
+++ /dev/null
@@ -1,541 +0,0 @@
-/*
- * Copyright (c) 2016, 2017 Arm Limited.
- *
- * SPDX-License-Identifier: MIT
- *
- * Permission is hereby granted, free of charge, to any person obtaining a copy
- * of this software and associated documentation files (the "Software"), to
- * deal in the Software without restriction, including without limitation the
- * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
- * sell copies of the Software, and to permit persons to whom the Software is
- * furnished to do so, subject to the following conditions:
- *
- * The above copyright notice and this permission notice shall be included in all
- * copies or substantial portions of the Software.
- *
- * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
- * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
- * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
- * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
- * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
- * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
- * SOFTWARE.
- */
-#include "helpers.h"
-
-/***********************************************/
-/* Begin implementation of Sobel3x3 filter */
-/***********************************************/
-
-/** This OpenCL kernel that computes a Sobel3x3 filter.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel3x3(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- short8 gx = (short8)0;
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = (short8)0;
-#endif /* GRAD_Y */
-
- // Row0
- uchar16 temp = vload16(0, offset(&src, -1, -1));
- short8 left = convert_short8(temp.s01234567);
- short8 middle = convert_short8(temp.s12345678);
- short8 right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-1);
- gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(-1);
- gy += middle * (short8)(-2);
- gy += right * (short8)(-1);
-#endif /* GRAD_Y */
-
- // Row1
- temp = vload16(0, offset(&src, -1, 0));
- left = convert_short8(temp.s01234567);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-2);
- gx += right * (short8)(+2);
-#endif /* GRAD_X */
-
- // Row2
- temp = vload16(0, offset(&src, -1, 1));
- left = convert_short8(temp.s01234567);
- middle = convert_short8(temp.s12345678);
- right = convert_short8(temp.s23456789);
-#ifdef GRAD_X
- gx += left * (short8)(-1);
- gx += right * (short8)(+1);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- gy += left * (short8)(+1);
- gy += middle * (short8)(+2);
- gy += right * (short8)(+1);
-#endif /* GRAD_Y */
-
- // Store results
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel3x3 filter */
-/**********************************************/
-
-/***********************************************/
-/* Begin implementation of Sobel5x5 filter */
-/***********************************************/
-
-/** Compute a 1D horizontal sobel filter 1x5 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] left1_coeff_gx Weight of the most left pixel for gx
- * @param[in] left2_coeff_gx Weight of the left pixel for gx
- * @param[in] middle_coeff_gx Weight of the middle pixel for gx
- * @param[in] right1_coeff_gx Weight of the right pixel for gx
- * @param[in] right2_coeff_gx Weight of the most right pixel for gx
- * @param[in] left1_coeff_gy Weight of the most left pixel for gy
- * @param[in] left2_coeff_gy Weight of the left pixel for gy
- * @param[in] middle_coeff_gy Weight of the middle pixel for gy
- * @param[in] right1_coeff_gy Weight of the right pixel for gy
- * @param[in] right2_coeff_gy Weight of the most right pixel for gy
- *
- * @return a short16 containing short8 gx and short8 gy values.
- */
-short16 sobel1x5(
- Image *src,
- const short left1_coeff_gx,
- const short left2_coeff_gx,
- const short middle_coeff_gx,
- const short right1_coeff_gx,
- const short right2_coeff_gx,
- const short left1_coeff_gy,
- const short left2_coeff_gy,
- const short middle_coeff_gy,
- const short right1_coeff_gy,
- const short right2_coeff_gy)
-{
- uchar16 temp = vload16(0, offset(src, -2, 0));
- short8 gx = 0;
- short8 gy = 0;
- short8 val;
-
- val = convert_short8(temp.s01234567);
- gx += val * (short8)left1_coeff_gx;
- gy += val * (short8)left1_coeff_gy;
-
- val = convert_short8(temp.s12345678);
- gx += val * (short8)left2_coeff_gx;
- gy += val * (short8)left2_coeff_gy;
-
- val = convert_short8(temp.s23456789);
- gx += val * (short8)middle_coeff_gx;
- gy += val * (short8)middle_coeff_gy;
-
- val = convert_short8(temp.s3456789a);
- gx += val * (short8)right1_coeff_gx;
- gy += val * (short8)right1_coeff_gy;
-
- val = convert_short8(temp.s456789ab);
- gx += val * (short8)right2_coeff_gx;
- gy += val * (short8)right2_coeff_gy;
-
- return (short16)(gx, gy);
-}
-
-/** Compute a 1D vertical sobel filter 5x1 for 8 bytes assuming the input is made of 1 channel of 1 byte (i.e 8 pixels).
- *
- * @param[in] src Pointer to source image.
- * @param[in] up1_coeff Weight of the most up pixel
- * @param[in] up2_coeff Weight of the up pixel
- * @param[in] middle_coeff Weight of the middle pixel
- * @param[in] down1_coeff Weight of the down pixel
- * @param[in] down2_coeff Weight of the most down pixel
- *
- * @return a short8 containing 8 convoluted values.
- */
-short8 sobel5x1(
- Image *src,
- const short up1_coeff,
- const short up2_coeff,
- const short middle_coeff,
- const short down1_coeff,
- const short down2_coeff)
-{
- short8 val;
- short8 out = (short8)0;
-
- val = vload8(0, (__global short *)offset(src, 0, -2));
- out += val * (short8)up1_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, -1));
- out += val * (short8)up2_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 0));
- out += val * (short8)middle_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 1));
- out += val * (short8)down1_coeff;
-
- val = vload8(0, (__global short *)offset(src, 0, 2));
- out += val * (short8)down2_coeff;
-
- return (short8)(out);
-}
-
-/** Apply a 1x5 sobel matrix to a single channel U8 input image and output two temporary channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image.. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image.. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x5(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
- short16 gx_gy = sobel1x5(&src,
- -1, -2, 0, 2, 1,
- 1, 4, 6, 4, 1);
-
- // Store result in dst
-#ifdef GRAD_X
- vstore8(gx_gy.s01234567, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gx_gy.s89ABCDEF, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 5x1 convolution matrix to two single channel S16 input temporary images
- * and output two single channel S16 images.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_x_ptr Pointer to the source image.. Supported data types: S16
- * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] src_y_ptr Pointer to the source image. Supported data types: S16
- * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] dummy Dummy parameter to easy conditional inclusion
- */
-__kernel void sobel_separable5x1(
-#ifdef GRAD_X
- IMAGE_DECLARATION(src_x),
- IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- IMAGE_DECLARATION(src_y),
- IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
- int dummy)
-{
-#ifdef GRAD_X
- Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
-#ifdef GRAD_X
- short8 gx = sobel5x1(&src_x,
- 1, 4, 6, 4, 1);
- vstore8(gx, 0, ((__global short *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- short8 gy = sobel5x1(&src_y,
- -1, -2, 0, 2, 1);
- vstore8(gy, 0, ((__global short *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel5x5 filter */
-/**********************************************/
-
-/***********************************************/
-/* Begin implementation of Sobel7x7 filter */
-/***********************************************/
-
-/* Sobel 1x7 horizontal X / 7x1 vertical Y coefficients */
-#define X0 -1
-#define X1 -4
-#define X2 -5
-#define X3 0
-#define X4 5
-#define X5 4
-#define X6 1
-
-/* Sobel 1x7 vertical X / 7x1 horizontal Y coefficients */
-#define Y0 1
-#define Y1 6
-#define Y2 15
-#define Y3 20
-#define Y4 15
-#define Y5 6
-#define Y6 1
-
-/* Calculates single horizontal iteration. */
-#define SOBEL1x1_HOR(src, gx, gy, idx) \
- { \
- int8 val = convert_int8(vload8(0, offset(src, idx - 3, 0))); \
- gx += val * X##idx; \
- gy += val * Y##idx; \
- }
-
-/* Calculates single vertical iteration. */
-#define SOBEL1x1_VERT(src, g, direction, idx) \
- { \
- int8 val = vload8(0, (__global int *)offset(src, 0, idx - 3)); \
- g += val * (int8)direction##idx; \
- }
-
-/* Calculates a 1x7 horizontal iteration. */
-#define SOBEL1x7(ptr, gx, gy) \
- SOBEL1x1_HOR(ptr, gx, gy, 0) \
- SOBEL1x1_HOR(ptr, gx, gy, 1) \
- SOBEL1x1_HOR(ptr, gx, gy, 2) \
- SOBEL1x1_HOR(ptr, gx, gy, 3) \
- SOBEL1x1_HOR(ptr, gx, gy, 4) \
- SOBEL1x1_HOR(ptr, gx, gy, 5) \
- SOBEL1x1_HOR(ptr, gx, gy, 6)
-
-/* Calculates a 7x1 vertical iteration. */
-#define SOBEL7x1(ptr, g, direction) \
- SOBEL1x1_VERT(ptr, g, direction, 0) \
- SOBEL1x1_VERT(ptr, g, direction, 1) \
- SOBEL1x1_VERT(ptr, g, direction, 2) \
- SOBEL1x1_VERT(ptr, g, direction, 3) \
- SOBEL1x1_VERT(ptr, g, direction, 4) \
- SOBEL1x1_VERT(ptr, g, direction, 5) \
- SOBEL1x1_VERT(ptr, g, direction, 6)
-
-/** Apply a 1x7 sobel matrix to a single channel U8 input image and output two temporary channel S16 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_ptr Pointer to the source image. Supported data types: U8
- * @param[in] src_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_step_y src_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S32
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- */
-__kernel void sobel_separable1x7(
- IMAGE_DECLARATION(src)
-#ifdef GRAD_X
- ,
- IMAGE_DECLARATION(dst_gx)
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- ,
- IMAGE_DECLARATION(dst_gy)
-#endif /* GRAD_Y */
-)
-{
- Image src = CONVERT_TO_IMAGE_STRUCT(src);
-#ifdef GRAD_X
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
- int8 gx = (int8)0;
- int8 gy = (int8)0;
-
- SOBEL1x7(&src, gx, gy);
-
- // Store result in dst
-#ifdef GRAD_X
- vstore8(gx, 0, ((__global int *)dst_gx.ptr));
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- vstore8(gy, 0, ((__global int *)dst_gy.ptr));
-#endif /* GRAD_Y */
-}
-
-/** Apply a 7x1 convolution matrix to two single channel S16 input temporary images and output two single channel S16 images and leave the borders undefined.
- *
- * @attention To enable computation of the X gradient -DGRAD_X must be passed at compile time, while computation of the Y gradient
- * is performed when -DGRAD_Y is used. You can use both when computation of both gradients is required.
- *
- * @param[in] src_x_ptr Pointer to the source image. Supported data types: S32
- * @param[in] src_x_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_x_step_x src_x_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_x_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_x_step_y src_x_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_x_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gx_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gx_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gx_step_x dst_gx_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gx_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gx_step_y dst_gx_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gx_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] src_y_ptr Pointer to the source image. Supported data types: S32
- * @param[in] src_y_stride_x Stride of the source image in X dimension (in bytes)
- * @param[in] src_y_step_x src_y_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] src_y_stride_y Stride of the source image in Y dimension (in bytes)
- * @param[in] src_y_step_y src_y_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] src_y_offset_first_element_in_bytes The offset of the first element in the source image
- * @param[out] dst_gy_ptr Pointer to the destination image. Supported data types: S16
- * @param[in] dst_gy_stride_x Stride of the destination image in X dimension (in bytes)
- * @param[in] dst_gy_step_x dst_gy_stride_x * number of elements along X processed per workitem(in bytes)
- * @param[in] dst_gy_stride_y Stride of the destination image in Y dimension (in bytes)
- * @param[in] dst_gy_step_y dst_gy_stride_y * number of elements along Y processed per workitem(in bytes)
- * @param[in] dst_gy_offset_first_element_in_bytes The offset of the first element in the destination image
- * @param[in] dummy Dummy parameter to easy conditional inclusion
- */
-__kernel void sobel_separable7x1(
-#ifdef GRAD_X
- IMAGE_DECLARATION(src_x),
- IMAGE_DECLARATION(dst_gx),
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- IMAGE_DECLARATION(src_y),
- IMAGE_DECLARATION(dst_gy),
-#endif /* GRAD_Y */
- int dummy)
-{
-#ifdef GRAD_X
- Image src_x = CONVERT_TO_IMAGE_STRUCT(src_x);
- Image dst_gx = CONVERT_TO_IMAGE_STRUCT(dst_gx);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- Image src_y = CONVERT_TO_IMAGE_STRUCT(src_y);
- Image dst_gy = CONVERT_TO_IMAGE_STRUCT(dst_gy);
-#endif /* GRAD_Y */
-
- // Output pixels
-#ifdef GRAD_X
- int8 gx = 0;
- SOBEL7x1(&src_x, gx, Y);
- vstore8(gx, 0, (__global int *)dst_gx.ptr);
-#endif /* GRAD_X */
-#ifdef GRAD_Y
- int8 gy = 0;
- SOBEL7x1(&src_y, gy, X);
- vstore8(gy, 0, (__global int *)dst_gy.ptr);
-#endif /* GRAD_Y */
-}
-
-/**********************************************/
-/* End implementation of Sobel7x7 filter */
-/**********************************************/
diff --git a/src/core/gpu/cl/ClKernelLibrary.cpp b/src/core/gpu/cl/ClKernelLibrary.cpp
index 73da93c1f5..1ba307cb1c 100644
--- a/src/core/gpu/cl/ClKernelLibrary.cpp
+++ b/src/core/gpu/cl/ClKernelLibrary.cpp
@@ -177,484 +177,508 @@ namespace opencl
{
const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{
- { "activation_layer", "activation_layer.cl" },
- { "activation_layer_quant", "activation_layer_quant.cl" },
- { "activation_layer_quant_f32", "activation_layer_quant.cl" },
- { "arg_min_max_x", "arg_min_max.cl" },
- { "arg_min_max_y", "arg_min_max.cl" },
- { "arg_min_max_z", "arg_min_max.cl" },
- { "arg_min_max_w", "arg_min_max.cl" },
- { "batch_to_space_nchw", "batch_to_space.cl" },
- { "batch_to_space_static_nchw", "batch_to_space.cl" },
- { "batch_to_space_nhwc", "batch_to_space.cl" },
- { "batch_to_space_static_nhwc", "batch_to_space.cl" },
- { "batchnormalization_layer_nchw", "batchnormalization_layer.cl" },
- { "batchnormalization_layer_nhwc", "batchnormalization_layer.cl" },
- { "bitwise_or", "bitwise_op.cl" },
- { "bitwise_and", "bitwise_op.cl" },
- { "bitwise_xor", "bitwise_op.cl" },
- { "bitwise_not", "bitwise_op.cl" },
- { "bounding_box_transform", "bounding_box_transform.cl" },
- { "bounding_box_transform_quantized", "bounding_box_transform_quantized.cl" },
- { "channel_shuffle_nchw", "channel_shuffle.cl" },
- { "channel_shuffle_nhwc", "channel_shuffle.cl" },
- { "compare_equal", "comparisons.cl" },
- { "compare_equal_quantized", "comparisons.cl" },
- { "compare_notequal", "comparisons.cl" },
- { "compare_notequal_quantized", "comparisons.cl" },
- { "compare_greater", "comparisons.cl" },
- { "compare_greater_quantized", "comparisons.cl" },
- { "compare_greaterequal", "comparisons.cl" },
- { "compare_greaterequal_quantized", "comparisons.cl" },
- { "compare_less", "comparisons.cl" },
- { "compare_less_quantized", "comparisons.cl" },
- { "compare_lessequal", "comparisons.cl" },
- { "compare_lessequal_quantized", "comparisons.cl" },
- { "concatenate", "concatenate.cl" },
- { "concatenate_width", "concatenate.cl" },
- { "concatenate_height", "concatenate.cl" },
- { "concatenate_width_x2", "concatenate.cl" },
- { "concatenate_width_x4", "concatenate.cl" },
- { "col2im", "col2im.cl" },
- { "cast_down", "cast.cl" },
- { "cast_up", "cast.cl" },
- { "convert_fc_weights", "convert_fc_weights.cl" },
- { "copy_tensor", "copy_tensor.cl" },
- { "crop_tensor", "crop_tensor.cl" },
- { "deconvolution_reshape", "deconvolution_layer.cl" },
- { "deconvolution_upsample", "deconvolution_layer.cl" },
- { "depth_to_space_nchw", "depth_to_space.cl" },
- { "depth_to_space_nhwc", "depth_to_space.cl" },
- { "dequantization_layer", "dequantization_layer.cl" },
- { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" },
- { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" },
- { "dwc_native_fp_nhwc", "dwc_native_fp_nhwc.cl" },
- { "dwc_native_quantized_nhwc", "dwc_native_quantized_nhwc.cl" },
- { "direct_convolution_nhwc", "direct_convolution.cl" },
- { "direct_convolution1x1", "direct_convolution1x1.cl" },
- { "direct_convolution1x1_f32_bifrost", "direct_convolution1x1.cl" },
- { "direct_convolution3x3", "direct_convolution3x3.cl" },
- { "direct_convolution3x3_f32_bifrost", "direct_convolution3x3.cl" },
- { "direct_convolution5x5", "direct_convolution5x5.cl" },
- { "direct_convolution5x5_f32_bifrost", "direct_convolution5x5.cl" },
- { "direct_convolution_quantized", "direct_convolution_quantized.cl" },
- { "elementwise_operation_ADD", "elementwise_operation.cl" },
- { "elementwise_operation_SUB", "elementwise_operation.cl" },
- { "elementwise_operation_MAX", "elementwise_operation.cl" },
- { "elementwise_operation_MIN", "elementwise_operation.cl" },
- { "elementwise_operation_DIV", "elementwise_operation.cl" },
- { "elementwise_operation_SQUARED_DIFF", "elementwise_operation.cl" },
- { "elementwise_operation_POWER", "elementwise_operation.cl" },
- { "elementwise_operation_PRELU", "elementwise_operation.cl" },
- { "elementwise_operation_AND", "elementwise_operation.cl" },
- { "elementwise_operation_OR", "elementwise_operation.cl" },
- { "elementwise_operation_ADD_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_SUB_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_MAX_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_MIN_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_DIV_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_SQUARED_DIFF_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_operation_PRELU_quantized", "elementwise_operation_quantized.cl" },
- { "elementwise_unary", "elementwise_unary.cl" },
- { "fft_digit_reverse_axis_0", "fft_digit_reverse.cl" },
- { "fft_digit_reverse_axis_1", "fft_digit_reverse.cl" },
- { "fft_radix_2_first_stage_axis_0", "fft.cl" },
- { "fft_radix_2_first_stage_axis_1", "fft.cl" },
- { "fft_radix_2_axis_0", "fft.cl" },
- { "fft_radix_2_axis_1", "fft.cl" },
- { "fft_radix_3_first_stage_axis_0", "fft.cl" },
- { "fft_radix_3_first_stage_axis_1", "fft.cl" },
- { "fft_radix_3_axis_0", "fft.cl" },
- { "fft_radix_3_axis_1", "fft.cl" },
- { "fft_radix_4_first_stage_axis_0", "fft.cl" },
- { "fft_radix_4_first_stage_axis_1", "fft.cl" },
- { "fft_radix_4_axis_0", "fft.cl" },
- { "fft_radix_4_axis_1", "fft.cl" },
- { "fft_radix_5_first_stage_axis_0", "fft.cl" },
- { "fft_radix_5_first_stage_axis_1", "fft.cl" },
- { "fft_radix_5_axis_0", "fft.cl" },
- { "fft_radix_5_axis_1", "fft.cl" },
- { "fft_radix_7_first_stage_axis_0", "fft.cl" },
- { "fft_radix_7_first_stage_axis_1", "fft.cl" },
- { "fft_radix_7_axis_0", "fft.cl" },
- { "fft_radix_7_axis_1", "fft.cl" },
- { "fft_radix_8_first_stage_axis_0", "fft.cl" },
- { "fft_radix_8_first_stage_axis_1", "fft.cl" },
- { "fft_radix_8_axis_0", "fft.cl" },
- { "fft_radix_8_axis_1", "fft.cl" },
- { "fft_scale_conj", "fft_scale.cl" },
- { "fill_image_borders_constant", "fill_border.cl" },
- { "fill_image_borders_replicate", "fill_border.cl" },
- { "floor_layer", "floor.cl" },
- { "fuse_batchnormalization_layer", "batchnormalization_layer.cl" },
- { "gather", "gather.cl" },
- { "gemm_ma_f16", "gemm.cl" },
- { "gemm_ma_f32", "gemm.cl" },
- { "gemm_mv", "gemv.cl" },
- { "gemm_mv_quantized", "gemv.cl" },
- { "gemm_mm_interleaved_transposed_f16", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f16_acc32", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f16_bifrost", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f32", "gemm_v1.cl" },
- { "gemm_mm_interleaved_transposed_f32_bifrost", "gemm_v1.cl" },
- { "gemm_mm_floating_point", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f16_bifrost", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f16_bifrost_acc32", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f32_bifrost", "gemm_v1.cl" },
- { "gemm_mm_floating_point_f32_bifrost_1000", "gemm_v1.cl" },
- { "gemm_mm_native", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt", "gemm.cl" },
- { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_nt_texture", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_t", "gemm.cl" },
- { "gemm_mm_reshaped_only_rhs_t_texture", "gemm.cl" },
- { "gemm_lc_vm_f32", "gemm.cl" },
- { "gemm_reshape_lhs_matrix_nt", "gemm.cl" },
- { "gemm_reshape_lhs_matrix_t", "gemm.cl" },
- { "gemm_reshape_rhs_matrix_nt", "gemm.cl" },
- { "gemm_reshape_rhs_matrix_t", "gemm.cl" },
- { "gemmlowp_matrix_a_reduction", "gemmlowp.cl" },
- { "gemmlowp_matrix_a_reduction_dot8", "gemmlowp.cl" },
- { "gemmlowp_matrix_b_reduction", "gemmlowp.cl" },
- { "gemmlowp_mm_native", "gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_t", "gemmlowp.cl" },
- { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "gemmlowp.cl" },
- { "gemmlowp_offset_contribution", "gemmlowp.cl" },
- { "gemmlowp_offset_contribution_quantize_down", "gemmlowp.cl" },
- { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_fixedpoint", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "gemmlowp.cl" },
- { "gemmlowp_output_stage_quantize_down_float", "gemmlowp.cl" },
- { "generate_proposals_compute_all_anchors", "generate_proposals.cl" },
- { "generate_proposals_compute_all_anchors_quantized", "generate_proposals_quantized.cl" },
- { "im2col1x1_stridex1_nchw", "im2col.cl" },
- { "im2col3x3_nchw", "im2col.cl" },
- { "im2col5x5_nchw", "im2col.cl" },
- { "im2col11x11_padx0_pady0_nchw", "im2col.cl" },
- { "im2col_generic_nchw", "im2col.cl" },
- { "im2col_generic_padx0_pady0_nchw", "im2col.cl" },
- { "im2col3x3_nhwc", "im2col.cl" },
- { "im2col9x9_nhwc", "im2col.cl" },
- { "im2col_generic_nhwc", "im2col.cl" },
- { "instance_normalization", "instance_normalization.cl" },
- { "compute_mean_var", "instance_normalization.cl" },
- { "l2_normalize_x", "l2_normalize.cl" },
- { "l2_normalize_y", "l2_normalize.cl" },
- { "l2_normalize_z", "l2_normalize.cl" },
- { "max_unpooling_layer_2", "unpooling_layer.cl" },
- { "mean_stddev_normalization", "mean_stddev_normalization.cl" },
- { "memset", "memset.cl" },
- { "minmax_layer", "minmax_layer.cl" },
- { "non_max_suppression", "nonmax.cl" },
- { "normalization_layer_cross_map_nchw", "normalization_layer.cl" },
- { "normalization_layer_cross_map_nhwc", "normalization_layer.cl" },
- { "normalization_layer_in_map_nchw", "normalization_layer.cl" },
- { "normalization_layer_in_map_nhwc", "normalization_layer.cl" },
- { "normalize_planar_yuv_layer_nchw", "normalize_planar_yuv_layer.cl" },
- { "normalize_planar_yuv_layer_nhwc", "normalize_planar_yuv_layer.cl" },
- { "normalize_planar_yuv_layer_q8_nchw", "normalize_planar_yuv_layer_quantized.cl" },
- { "normalize_planar_yuv_layer_q8_nhwc", "normalize_planar_yuv_layer_quantized.cl" },
- { "pad_layer_constant", "pad_layer.cl" },
- { "pad_layer_symmetric_reflect", "pad_layer.cl" },
- { "permute", "permute.cl" },
- { "pixelwise_mul_complex", "pixelwise_mul_float.cl" },
- { "pixelwise_mul_float", "pixelwise_mul_float.cl" },
- { "pixelwise_mul_int", "pixelwise_mul_int.cl" },
- { "pixelwise_mul_quantized", "pixelwise_mul_int.cl" },
- { "pooling_layer_2", "pooling_layer.cl" },
- { "pooling_layer_3", "pooling_layer.cl" },
- { "pooling_layer_optimized_3", "pooling_layer.cl" },
- { "pooling_layer_7", "pooling_layer.cl" },
- { "pooling_layer_MxN_nchw", "pooling_layer.cl" },
- { "pooling_layer_MxN_nhwc", "pooling_layer.cl" },
- { "pooling_layer_2x2_nhwc", "pooling_layer.cl" },
- { "pooling_layer_2_nchw_indices_fp32", "pooling_layer.cl" },
- { "pooling_layer_2_nchw_indices_fp16", "pooling_layer.cl" },
- { "pooling_layer_MxN_quantized_nhwc", "pooling_layer_quantized.cl" },
- { "pooling_layer_MxN_quantized_nchw", "pooling_layer_quantized.cl" },
- { "prior_box_layer_nchw", "prior_box_layer.cl" },
- { "qlstm_layer_normalization", "qlstm_layer_normalization.cl" },
- { "quantization_layer", "quantization_layer.cl" },
- { "range", "range.cl" },
- { "range_quantized", "range.cl" },
- { "reduction_operation_x", "reduction_operation.cl" },
- { "reduction_operation_non_parallel_x", "reduction_operation.cl" },
- { "reduction_operation_y", "reduction_operation.cl" },
- { "reduction_operation_z", "reduction_operation.cl" },
- { "reduction_operation_w", "reduction_operation.cl" },
- { "remap_nearest_neighbour_nchw", "remap.cl" },
- { "remap_bilinear_nchw", "remap.cl" },
- { "remap_nearest_neighbour_nhwc", "remap.cl" },
- { "remap_bilinear_nhwc", "remap.cl" },
- { "reorg_layer_nchw", "reorg_layer.cl" },
- { "reorg_layer_nhwc", "reorg_layer.cl" },
- { "reshape_layer", "reshape_layer.cl" },
- { "reshape_to_columns", "convolution_layer.cl" },
- { "reverse", "reverse.cl" },
- { "roi_align_layer", "roi_align_layer.cl" },
- { "roi_align_layer_quantized", "roi_align_layer_quantized.cl" },
- { "roi_pooling_layer", "roi_pooling_layer.cl" },
- { "scale_nearest_neighbour_nchw", "scale.cl" },
- { "scale_nearest_neighbour_nhwc", "scale.cl" },
- { "scale_bilinear_nchw", "scale.cl" },
- { "scale_bilinear_nhwc", "scale.cl" },
- { "scale_bilinear_quantized_nchw", "scale_quantized.cl" },
- { "scale_bilinear_quantized_nhwc", "scale_quantized.cl" },
- { "select_same_rank", "select.cl" },
- { "select_different_rank_2", "select.cl" },
- { "select_different_rank_n", "select.cl" },
- { "softmax_layer_norm", "softmax_layer.cl" },
- { "softmax_layer_norm_quantized", "softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_quantized_serial", "softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_quantized_parallel", "softmax_layer_quantized.cl" },
- { "softmax_layer_max_shift_exp_sum_serial", "softmax_layer.cl" },
- { "space_to_batch_nchw", "space_to_batch.cl" },
- { "space_to_batch_static_nchw", "space_to_batch.cl" },
- { "space_to_batch_nhwc", "space_to_batch.cl" },
- { "space_to_batch_static_nhwc", "space_to_batch.cl" },
- { "space_to_depth_nchw", "space_to_depth.cl" },
- { "space_to_depth_nhwc", "space_to_depth.cl" },
- { "softmax_layer_max_shift_exp_sum_parallel", "softmax_layer.cl" },
- { "stack_layer", "stack_layer.cl" },
- { "strided_slice", "slice_ops.cl" },
- { "tile", "tile.cl" },
- { "transpose", "transpose.cl" },
- { "upsample_layer_nchw", "upsample_layer.cl" },
- { "upsample_layer_nhwc", "upsample_layer.cl" },
- { "winograd_filter_transform_2x2_3x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x1_3x1_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x2_1x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_3x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_3x1_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x3_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_5x5_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_5x1_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x5_nchw", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_3x1_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x3_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_3x3_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x4_5x5_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_4x1_5x1_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x4_1x5_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x2_7x7_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_2x1_7x1_nhwc", "winograd_filter_transform.cl" },
- { "winograd_filter_transform_1x2_1x7_nhwc", "winograd_filter_transform.cl" },
- { "winograd_input_transform_2x2_3x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x2_3x3_stepz2_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_3x1_stepz2_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x3_stepz2_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_3x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_3x1_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x3_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_5x5_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_5x1_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x5_stepz1_nchw", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "winograd_input_transform.cl" },
- { "winograd_output_transform_2x2_3x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_2x1_3x1_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x2_1x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_3x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_3x1_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x3_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_5x5_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_5x1_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x5_nchw", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_3x1_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x3_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_3x3_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x4_5x5_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_4x1_5x1_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x4_1x5_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_2x2_7x7_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_2x1_7x1_nhwc", "winograd_output_transform.cl" },
- { "winograd_output_transform_1x2_1x7_nhwc", "winograd_output_transform.cl" },
+ { "activation_layer", "common/activation_layer.cl" },
+ { "activation_layer_quant", "common/activation_layer_quant.cl" },
+ { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
+ { "arg_min_max_x", "common/arg_min_max.cl" },
+ { "arg_min_max_y", "common/arg_min_max.cl" },
+ { "arg_min_max_z", "common/arg_min_max.cl" },
+ { "arg_min_max_w", "common/arg_min_max.cl" },
+ { "batch_to_space_nchw", "nchw/batch_to_space.cl" },
+ { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" },
+ { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" },
+ { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" },
+ { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" },
+ { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" },
+ { "bitwise_or", "common/bitwise_op.cl" },
+ { "bitwise_and", "common/bitwise_op.cl" },
+ { "bitwise_xor", "common/bitwise_op.cl" },
+ { "bitwise_not", "common/bitwise_op.cl" },
+ { "bounding_box_transform", "common/bounding_box_transform.cl" },
+ { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
+ { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" },
+ { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" },
+ { "compare_equal", "common/comparisons.cl" },
+ { "compare_equal_quantized", "common/comparisons.cl" },
+ { "compare_notequal", "common/comparisons.cl" },
+ { "compare_notequal_quantized", "common/comparisons.cl" },
+ { "compare_greater", "common/comparisons.cl" },
+ { "compare_greater_quantized", "common/comparisons.cl" },
+ { "compare_greaterequal", "common/comparisons.cl" },
+ { "compare_greaterequal_quantized", "common/comparisons.cl" },
+ { "compare_less", "common/comparisons.cl" },
+ { "compare_less_quantized", "common/comparisons.cl" },
+ { "compare_lessequal", "common/comparisons.cl" },
+ { "compare_lessequal_quantized", "common/comparisons.cl" },
+ { "concatenate", "common/concatenate.cl" },
+ { "concatenate_width", "common/concatenate.cl" },
+ { "concatenate_height", "common/concatenate.cl" },
+ { "concatenate_width_x2", "common/concatenate.cl" },
+ { "concatenate_width_x4", "common/concatenate.cl" },
+ { "col2im", "common/col2im.cl" },
+ { "cast_down", "common/cast.cl" },
+ { "cast_up", "common/cast.cl" },
+ { "convert_fc_weights", "common/convert_fc_weights.cl" },
+ { "copy_tensor", "common/copy_tensor.cl" },
+ { "crop_tensor", "common/crop_tensor.cl" },
+ { "deconvolution_reshape", "common/deconvolution_layer.cl" },
+ { "deconvolution_upsample", "common/deconvolution_layer.cl" },
+ { "depth_to_space_nchw", "nchw/depth_to_space.cl" },
+ { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" },
+ { "dequantization_layer", "common/dequantization_layer.cl" },
+ { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" },
+ { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" },
+ { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" },
+ { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" },
+ { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" },
+ { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" },
+ { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" },
+ { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" },
+ { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" },
+ { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" },
+ { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" },
+ { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" },
+ { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
+ { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
+ { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
+ { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
+ { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
+ { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
+ { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
+ { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
+ { "elementwise_operation_AND", "common/elementwise_operation.cl" },
+ { "elementwise_operation_OR", "common/elementwise_operation.cl" },
+ { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" },
+ { "elementwise_unary", "common/elementwise_unary.cl" },
+ { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" },
+ { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" },
+ { "fft_radix_2_first_stage_axis_0", "common/fft.cl" },
+ { "fft_radix_2_first_stage_axis_1", "common/fft.cl" },
+ { "fft_radix_2_axis_0", "common/fft.cl" },
+ { "fft_radix_2_axis_1", "common/fft.cl" },
+ { "fft_radix_3_first_stage_axis_0", "common/fft.cl" },
+ { "fft_radix_3_first_stage_axis_1", "common/fft.cl" },
+ { "fft_radix_3_axis_0", "common/fft.cl" },
+ { "fft_radix_3_axis_1", "common/fft.cl" },
+ { "fft_radix_4_first_stage_axis_0", "common/fft.cl" },
+ { "fft_radix_4_first_stage_axis_1", "common/fft.cl" },
+ { "fft_radix_4_axis_0", "common/fft.cl" },
+ { "fft_radix_4_axis_1", "common/fft.cl" },
+ { "fft_radix_5_first_stage_axis_0", "common/fft.cl" },
+ { "fft_radix_5_first_stage_axis_1", "common/fft.cl" },
+ { "fft_radix_5_axis_0", "common/fft.cl" },
+ { "fft_radix_5_axis_1", "common/fft.cl" },
+ { "fft_radix_7_first_stage_axis_0", "common/fft.cl" },
+ { "fft_radix_7_first_stage_axis_1", "common/fft.cl" },
+ { "fft_radix_7_axis_0", "common/fft.cl" },
+ { "fft_radix_7_axis_1", "common/fft.cl" },
+ { "fft_radix_8_first_stage_axis_0", "common/fft.cl" },
+ { "fft_radix_8_first_stage_axis_1", "common/fft.cl" },
+ { "fft_radix_8_axis_0", "common/fft.cl" },
+ { "fft_radix_8_axis_1", "common/fft.cl" },
+ { "fft_scale_conj", "common/fft_scale.cl" },
+ { "fill_image_borders_constant", "common/fill_border.cl" },
+ { "fill_image_borders_replicate", "common/fill_border.cl" },
+ { "floor_layer", "common/floor.cl" },
+ { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" },
+ { "gather", "common/gather.cl" },
+ { "gemm_ma_f16", "common/gemm.cl" },
+ { "gemm_ma_f32", "common/gemm.cl" },
+ { "gemm_mv", "common/gemv.cl" },
+ { "gemm_mv_quantized", "common/gemv.cl" },
+ { "gemm_mm_interleaved_transposed_f16", "common/gemm_v1.cl" },
+ { "gemm_mm_interleaved_transposed_f16_acc32", "common/gemm_v1.cl" },
+ { "gemm_mm_interleaved_transposed_f16_bifrost", "common/gemm_v1.cl" },
+ { "gemm_mm_interleaved_transposed_f32", "common/gemm_v1.cl" },
+ { "gemm_mm_interleaved_transposed_f32_bifrost", "common/gemm_v1.cl" },
+ { "gemm_mm_floating_point", "common/gemm_v1.cl" },
+ { "gemm_mm_floating_point_f16_bifrost", "common/gemm_v1.cl" },
+ { "gemm_mm_floating_point_f16_bifrost_acc32", "common/gemm_v1.cl" },
+ { "gemm_mm_floating_point_f32_bifrost", "common/gemm_v1.cl" },
+ { "gemm_mm_floating_point_f32_bifrost_1000", "common/gemm_v1.cl" },
+ { "gemm_mm_native", "common/gemm.cl" },
+ { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" },
+ { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" },
+ { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" },
+ { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" },
+ { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" },
+ { "gemm_lc_vm_f32", "common/gemm.cl" },
+ { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" },
+ { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" },
+ { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" },
+ { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" },
+ { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" },
+ { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" },
+ { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" },
+ { "gemmlowp_mm_native", "common/gemmlowp.cl" },
+ { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" },
+ { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" },
+ { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" },
+ { "gemmlowp_offset_contribution", "common/gemmlowp.cl" },
+ { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" },
+ { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" },
+ { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" },
+ { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" },
+ { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" },
+ { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" },
+ { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" },
+ { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" },
+ { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" },
+ { "im2col3x3_nchw", "nchw/im2col.cl" },
+ { "im2col5x5_nchw", "nchw/im2col.cl" },
+ { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" },
+ { "im2col_generic_nchw", "nchw/im2col.cl" },
+ { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" },
+ { "im2col3x3_nhwc", "nhwc/im2col.cl" },
+ { "im2col9x9_nhwc", "nhwc/im2col.cl" },
+ { "im2col_generic_nhwc", "nhwc/im2col.cl" },
+ { "instance_normalization", "common/instance_normalization.cl" },
+ { "compute_mean_var", "common/instance_normalization.cl" },
+ { "l2_normalize_x", "common/l2_normalize.cl" },
+ { "l2_normalize_y", "common/l2_normalize.cl" },
+ { "l2_normalize_z", "common/l2_normalize.cl" },
+ { "max_unpooling_layer_2", "common/unpooling_layer.cl" },
+ { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" },
+ { "memset", "common/memset.cl" },
+ { "minmax_layer", "common/minmax_layer.cl" },
+ { "non_max_suppression", "common/nonmax.cl" },
+ { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" },
+ { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" },
+ { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" },
+ { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" },
+ { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" },
+ { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" },
+ { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" },
+ { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" },
+ { "pad_layer_constant", "common/pad_layer.cl" },
+ { "pad_layer_symmetric_reflect", "common/pad_layer.cl" },
+ { "permute", "common/permute.cl" },
+ { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" },
+ { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" },
+ { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" },
+ { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" },
+ { "pooling_layer_2", "common/pooling_layer.cl" },
+ { "pooling_layer_3", "common/pooling_layer.cl" },
+ { "pooling_layer_optimized_3", "common/pooling_layer.cl" },
+ { "pooling_layer_7", "common/pooling_layer.cl" },
+ { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" },
+ { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" },
+ { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" },
+ { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" },
+ { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" },
+ { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" },
+ { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" },
+ { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" },
+ { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" },
+ { "quantization_layer", "common/quantization_layer.cl" },
+ { "range", "common/range.cl" },
+ { "range_quantized", "common/range.cl" },
+ { "reduction_operation_x", "common/reduction_operation.cl" },
+ { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" },
+ { "reduction_operation_y", "common/reduction_operation.cl" },
+ { "reduction_operation_z", "common/reduction_operation.cl" },
+ { "reduction_operation_w", "common/reduction_operation.cl" },
+ { "remap_nearest_neighbour_nchw", "nchw/remap.cl" },
+ { "remap_bilinear_nchw", "nchw/remap.cl" },
+ { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" },
+ { "remap_bilinear_nhwc", "nhwc/remap.cl" },
+ { "reorg_layer_nchw", "nchw/reorg_layer.cl" },
+ { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" },
+ { "reshape_layer", "common/reshape_layer.cl" },
+ { "reshape_to_columns", "common/convolution_layer.cl" },
+ { "reverse", "common/reverse.cl" },
+ { "roi_align_layer", "common/roi_align_layer.cl" },
+ { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" },
+ { "roi_pooling_layer", "common/roi_pooling_layer.cl" },
+ { "scale_nearest_neighbour_nchw", "nchw/scale.cl" },
+ { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" },
+ { "scale_bilinear_nchw", "nchw/scale.cl" },
+ { "scale_bilinear_nhwc", "nhwc/scale.cl" },
+ { "scale_bilinear_quantized_nchw", "nchw/scale_quantized.cl" },
+ { "scale_bilinear_quantized_nhwc", "nhwc/scale_quantized.cl" },
+ { "select_same_rank", "common/select.cl" },
+ { "select_different_rank_2", "common/select.cl" },
+ { "select_different_rank_n", "common/select.cl" },
+ { "softmax_layer_norm", "common/softmax_layer.cl" },
+ { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" },
+ { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" },
+ { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" },
+ { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" },
+ { "space_to_batch_nchw", "nchw/space_to_batch.cl" },
+ { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" },
+ { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" },
+ { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" },
+ { "space_to_depth_nchw", "nchw/space_to_depth.cl" },
+ { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" },
+ { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" },
+ { "stack_layer", "common/stack_layer.cl" },
+ { "strided_slice", "common/slice_ops.cl" },
+ { "tile", "common/tile.cl" },
+ { "transpose", "common/transpose.cl" },
+ { "upsample_layer_nchw", "nchw/upsample_layer.cl" },
+ { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" },
+ { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" },
+ { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" },
+ { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" },
+ { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" },
};
const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
{
#ifdef EMBEDDED_KERNELS
{
- "activation_layer.cl",
-#include "./cl_kernels/activation_layer.clembed"
+ "common/activation_layer.cl",
+#include "./cl_kernels/common/activation_layer.clembed"
},
{
- "activation_layer_quant.cl",
-#include "./cl_kernels/activation_layer_quant.clembed"
+ "common/activation_layer_quant.cl",
+#include "./cl_kernels/common/activation_layer_quant.clembed"
},
{
- "arg_min_max.cl",
-#include "./cl_kernels/arg_min_max.clembed"
+ "common/arg_min_max.cl",
+#include "./cl_kernels/common/arg_min_max.clembed"
},
{
- "batch_to_space.cl",
-#include "./cl_kernels/batch_to_space.clembed"
+ "nchw/batch_to_space.cl",
+#include "./cl_kernels/nchw/batch_to_space.clembed"
},
{
- "bitwise_op.cl",
-#include "./cl_kernels/bitwise_op.clembed"
+ "nhwc/batch_to_space.cl",
+#include "./cl_kernels/nhwc/batch_to_space.clembed"
},
{
- "bounding_box_transform.cl",
-#include "./cl_kernels/bounding_box_transform.clembed"
+ "common/bitwise_op.cl",
+#include "./cl_kernels/common/bitwise_op.clembed"
},
{
- "bounding_box_transform_quantized.cl",
-#include "./cl_kernels/bounding_box_transform_quantized.clembed"
+ "common/bounding_box_transform.cl",
+#include "./cl_kernels/common/bounding_box_transform.clembed"
},
{
- "channel_shuffle.cl",
-#include "./cl_kernels/channel_shuffle.clembed"
+ "common/bounding_box_transform_quantized.cl",
+#include "./cl_kernels/common/bounding_box_transform_quantized.clembed"
},
{
- "col2im.cl",
-#include "./cl_kernels/col2im.clembed"
+ "nchw/channel_shuffle.cl",
+#include "./cl_kernels/nchw/channel_shuffle.clembed"
},
{
- "comparisons.cl",
-#include "./cl_kernels/comparisons.clembed"
+ "nhwc/channel_shuffle.cl",
+#include "./cl_kernels/nhwc/channel_shuffle.clembed"
},
{
- "concatenate.cl",
-#include "./cl_kernels/concatenate.clembed"
+ "common/col2im.cl",
+#include "./cl_kernels/common/col2im.clembed"
},
{
- "convert_fc_weights.cl",
-#include "./cl_kernels/convert_fc_weights.clembed"
+ "common/comparisons.cl",
+#include "./cl_kernels/common/comparisons.clembed"
},
{
- "convolution_layer.cl",
-#include "./cl_kernels/convolution_layer.clembed"
+ "common/concatenate.cl",
+#include "./cl_kernels/common/concatenate.clembed"
},
{
- "copy_tensor.cl",
-#include "./cl_kernels/copy_tensor.clembed"
+ "common/convert_fc_weights.cl",
+#include "./cl_kernels/common/convert_fc_weights.clembed"
},
{
- "crop_tensor.cl",
-#include "./cl_kernels/crop_tensor.clembed"
+ "common/convolution_layer.cl",
+#include "./cl_kernels/common/convolution_layer.clembed"
},
{
- "upsample_layer.cl",
-#include "./cl_kernels/upsample_layer.clembed"
+ "common/copy_tensor.cl",
+#include "./cl_kernels/common/copy_tensor.clembed"
},
{
- "deconvolution_layer.cl",
-#include "./cl_kernels/deconvolution_layer.clembed"
+ "common/crop_tensor.cl",
+#include "./cl_kernels/common/crop_tensor.clembed"
},
{
- "cast.cl",
-#include "./cl_kernels/cast.clembed"
+ "nchw/upsample_layer.cl",
+#include "./cl_kernels/nchw/upsample_layer.clembed"
},
{
- "depth_to_space.cl",
-#include "./cl_kernels/depth_to_space.clembed"
+ "nhwc/upsample_layer.cl",
+#include "./cl_kernels/nhwc/upsample_layer.clembed"
},
{
- "dequantization_layer.cl",
-#include "./cl_kernels/dequantization_layer.clembed"
+ "common/deconvolution_layer.cl",
+#include "./cl_kernels/common/deconvolution_layer.clembed"
},
{
- "direct_convolution1x1.cl",
-#include "./cl_kernels/direct_convolution1x1.clembed"
+ "common/cast.cl",
+#include "./cl_kernels/common/cast.clembed"
},
{
- "direct_convolution3x3.cl",
-#include "./cl_kernels/direct_convolution3x3.clembed"
+ "nchw/depth_to_space.cl",
+#include "./cl_kernels/nchw/depth_to_space.clembed"
},
{
- "direct_convolution5x5.cl",
-#include "./cl_kernels/direct_convolution5x5.clembed"
+ "nhwc/depth_to_space.cl",
+#include "./cl_kernels/nhwc/depth_to_space.clembed"
},
{
- "direct_convolution_quantized.cl",
-#include "./cl_kernels/direct_convolution_quantized.clembed"
+ "common/dequantization_layer.cl",
+#include "./cl_kernels/common/dequantization_layer.clembed"
},
{
- "direct_convolution.cl",
-#include "./cl_kernels/direct_convolution.clembed"
+ "nchw/dequantization_layer.cl",
+#include "./cl_kernels/nchw/dequantization_layer.clembed"
},
{
- "dwc_native_fp_nhwc.cl",
-#include "./cl_kernels/dwc_native_fp_nhwc.clembed"
+ "nhwc/dequantization_layer.cl",
+#include "./cl_kernels/nhwc/dequantization_layer.clembed"
},
{
- "dwc_native_quantized_nhwc.cl",
-#include "./cl_kernels/dwc_native_quantized_nhwc.clembed"
+ "nchw/direct_convolution1x1.cl",
+#include "./cl_kernels/nchw/direct_convolution1x1.clembed"
},
{
- "elementwise_operation.cl",
-#include "./cl_kernels/elementwise_operation.clembed"
+ "nchw/direct_convolution3x3.cl",
+#include "./cl_kernels/nchw/direct_convolution3x3.clembed"
},
{
- "elementwise_operation_quantized.cl",
-#include "./cl_kernels/elementwise_operation_quantized.clembed"
+ "nchw/direct_convolution5x5.cl",
+#include "./cl_kernels/nchw/direct_convolution5x5.clembed"
},
{
- "elementwise_unary.cl",
-#include "./cl_kernels/elementwise_unary.clembed"
+ "nchw/direct_convolution_quantized.cl",
+#include "./cl_kernels/nchw/direct_convolution_quantized.clembed"
},
{
- "fft.cl",
-#include "./cl_kernels/fft.clembed"
+ "nhwc/direct_convolution.cl",
+#include "./cl_kernels/nhwc/direct_convolution.clembed"
},
{
- "fft_digit_reverse.cl",
-#include "./cl_kernels/fft_digit_reverse.clembed"
+ "nhwc/dwc_native_fp_nhwc.cl",
+#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed"
},
{
- "fft_scale.cl",
-#include "./cl_kernels/fft_scale.clembed"
+ "nhwc/dwc_native_quantized_nhwc.cl",
+#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed"
},
{
- "fill_border.cl",
-#include "./cl_kernels/fill_border.clembed"
+ "common/elementwise_operation.cl",
+#include "./cl_kernels/common/elementwise_operation.clembed"
},
{
- "floor.cl",
-#include "./cl_kernels/floor.clembed"
+ "common/elementwise_operation_quantized.cl",
+#include "./cl_kernels/common/elementwise_operation_quantized.clembed"
},
{
- "gather.cl",
-#include "./cl_kernels/gather.clembed"
+ "common/elementwise_unary.cl",
+#include "./cl_kernels/common/elementwise_unary.clembed"
},
{
- "gemm.cl",
-#include "./cl_kernels/gemm.clembed"
+ "common/fft.cl",
+#include "./cl_kernels/common/fft.clembed"
},
{
- "gemm_v1.cl",
-#include "./cl_kernels/gemm_v1.clembed"
+ "common/fft_digit_reverse.cl",
+#include "./cl_kernels/common/fft_digit_reverse.clembed"
},
{
- "gemmlowp.cl",
-#include "./cl_kernels/gemmlowp.clembed"
+ "common/fft_scale.cl",
+#include "./cl_kernels/common/fft_scale.clembed"
},
{
- "gemv.cl",
-#include "./cl_kernels/gemv.clembed"
+ "common/fill_border.cl",
+#include "./cl_kernels/common/fill_border.clembed"
},
{
- "generate_proposals.cl",
-#include "./cl_kernels/generate_proposals.clembed"
+ "common/floor.cl",
+#include "./cl_kernels/common/floor.clembed"
},
{
- "generate_proposals_quantized.cl",
-#include "./cl_kernels/generate_proposals_quantized.clembed"
+ "common/gather.cl",
+#include "./cl_kernels/common/gather.clembed"
+ },
+ {
+ "common/gemm.cl",
+#include "./cl_kernels/common/gemm.clembed"
+ },
+ {
+ "common/gemm_v1.cl",
+#include "./cl_kernels/common/gemm_v1.clembed"
+ },
+ {
+ "common/gemmlowp.cl",
+#include "./cl_kernels/common/gemmlowp.clembed"
+ },
+ {
+ "common/gemv.cl",
+#include "./cl_kernels/common/gemv.clembed"
+ },
+ {
+ "common/generate_proposals.cl",
+#include "./cl_kernels/common/generate_proposals.clembed"
+ },
+ {
+ "common/generate_proposals_quantized.cl",
+#include "./cl_kernels/common/generate_proposals_quantized.clembed"
},
{
"helpers.h",
@@ -665,184 +689,256 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/helpers_asymm.hembed"
},
{
- "im2col.cl",
-#include "./cl_kernels/im2col.clembed"
+ "nchw/im2col.cl",
+#include "./cl_kernels/nchw/im2col.clembed"
+ },
+ {
+ "nhwc/im2col.cl",
+#include "./cl_kernels/nhwc/im2col.clembed"
+ },
+ {
+ "common/instance_normalization.cl",
+#include "./cl_kernels/common/instance_normalization.clembed"
+ },
+ {
+ "common/l2_normalize.cl",
+#include "./cl_kernels/common/l2_normalize.clembed"
+ },
+ {
+ "common/mean_stddev_normalization.cl",
+#include "./cl_kernels/common/mean_stddev_normalization.clembed"
+ },
+ {
+ "common/memset.cl",
+#include "./cl_kernels/common/memset.clembed"
+ },
+ {
+ "common/minmax_layer.cl",
+#include "./cl_kernels/common/minmax_layer.clembed"
},
{
- "instance_normalization.cl",
-#include "./cl_kernels/instance_normalization.clembed"
+ "common/nonmax.cl",
+#include "./cl_kernels/common/nonmax.clembed"
},
{
- "l2_normalize.cl",
-#include "./cl_kernels/l2_normalize.clembed"
+ "nchw/normalization_layer.cl",
+#include "./cl_kernels/nchw/normalization_layer.clembed"
},
{
- "mean_stddev_normalization.cl",
-#include "./cl_kernels/mean_stddev_normalization.clembed"
+ "nhwc/normalization_layer.cl",
+#include "./cl_kernels/nhwc/normalization_layer.clembed"
},
{
- "memset.cl",
-#include "./cl_kernels/memset.clembed"
+ "nchw/normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed"
},
{
- "minmax_layer.cl",
-#include "./cl_kernels/minmax_layer.clembed"
+ "nhwc/normalize_planar_yuv_layer.cl",
+#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed"
},
{
- "nonmax.cl",
-#include "./cl_kernels/nonmax.clembed"
+ "nchw/normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed"
},
{
- "normalization_layer.cl",
-#include "./cl_kernels/normalization_layer.clembed"
+ "nhwc/normalize_planar_yuv_layer_quantized.cl",
+#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed"
},
{
- "normalize_planar_yuv_layer.cl",
-#include "./cl_kernels/normalize_planar_yuv_layer.clembed"
+ "common/batchnormalization_layer.cl",
+#include "./cl_kernels/common/batchnormalization_layer.clembed"
},
{
- "normalize_planar_yuv_layer_quantized.cl",
-#include "./cl_kernels/normalize_planar_yuv_layer_quantized.clembed"
+ "nchw/batchnormalization_layer.cl",
+#include "./cl_kernels/nchw/batchnormalization_layer.clembed"
},
{
- "batchnormalization_layer.cl",
-#include "./cl_kernels/batchnormalization_layer.clembed"
+ "nhwc/batchnormalization_layer.cl",
+#include "./cl_kernels/nhwc/batchnormalization_layer.clembed"
},
{
- "pad_layer.cl",
-#include "./cl_kernels/pad_layer.clembed"
+ "common/pad_layer.cl",
+#include "./cl_kernels/common/pad_layer.clembed"
},
{
- "permute.cl",
-#include "./cl_kernels/permute.clembed"
+ "common/permute.cl",
+#include "./cl_kernels/common/permute.clembed"
},
{
- "pixelwise_mul_float.cl",
-#include "./cl_kernels/pixelwise_mul_float.clembed"
+ "common/pixelwise_mul_float.cl",
+#include "./cl_kernels/common/pixelwise_mul_float.clembed"
},
{
- "pixelwise_mul_int.cl",
-#include "./cl_kernels/pixelwise_mul_int.clembed"
+ "common/pixelwise_mul_int.cl",
+#include "./cl_kernels/common/pixelwise_mul_int.clembed"
},
{
- "pooling_layer.cl",
-#include "./cl_kernels/pooling_layer.clembed"
+ "common/pooling_layer.cl",
+#include "./cl_kernels/common/pooling_layer.clembed"
},
{
- "pooling_layer_quantized.cl",
-#include "./cl_kernels/pooling_layer_quantized.clembed"
+ "nchw/pooling_layer.cl",
+#include "./cl_kernels/nchw/pooling_layer.clembed"
},
{
- "prior_box_layer.cl",
-#include "./cl_kernels/prior_box_layer.clembed"
+ "nhwc/pooling_layer.cl",
+#include "./cl_kernels/nhwc/pooling_layer.clembed"
},
{
- "qlstm_layer_normalization.cl",
-#include "./cl_kernels/qlstm_layer_normalization.clembed"
+ "nchw/pooling_layer_quantized.cl",
+#include "./cl_kernels/nchw/pooling_layer_quantized.clembed"
},
{
- "quantization_layer.cl",
-#include "./cl_kernels/quantization_layer.clembed"
+ "nhwc/pooling_layer_quantized.cl",
+#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed"
},
{
- "range.cl",
-#include "./cl_kernels/range.clembed"
+ "nchw/prior_box_layer.cl",
+#include "./cl_kernels/nchw/prior_box_layer.clembed"
},
{
- "reduction_operation.cl",
-#include "./cl_kernels/reduction_operation.clembed"
+ "common/qlstm_layer_normalization.cl",
+#include "./cl_kernels/common/qlstm_layer_normalization.clembed"
},
{
- "remap.cl",
-#include "./cl_kernels/remap.clembed"
+ "common/quantization_layer.cl",
+#include "./cl_kernels/common/quantization_layer.clembed"
},
{
- "reorg_layer.cl",
-#include "./cl_kernels/reorg_layer.clembed"
+ "common/range.cl",
+#include "./cl_kernels/common/range.clembed"
},
{
- "reshape_layer.cl",
-#include "./cl_kernels/reshape_layer.clembed"
+ "common/reduction_operation.cl",
+#include "./cl_kernels/common/reduction_operation.clembed"
},
{
- "reverse.cl",
-#include "./cl_kernels/reverse.clembed"
+ "nchw/remap.cl",
+#include "./cl_kernels/nchw/remap.clembed"
},
{
- "roi_align_layer.cl",
-#include "./cl_kernels/roi_align_layer.clembed"
+ "nhwc/remap.cl",
+#include "./cl_kernels/nhwc/remap.clembed"
},
{
- "roi_align_layer_quantized.cl",
-#include "./cl_kernels/roi_align_layer_quantized.clembed"
+ "nchw/reorg_layer.cl",
+#include "./cl_kernels/nchw/reorg_layer.clembed"
},
{
- "roi_pooling_layer.cl",
-#include "./cl_kernels/roi_pooling_layer.clembed"
+ "nhwc/reorg_layer.cl",
+#include "./cl_kernels/nhwc/reorg_layer.clembed"
},
{
- "scale.cl",
-#include "./cl_kernels/scale.clembed"
+ "common/reshape_layer.cl",
+#include "./cl_kernels/common/reshape_layer.clembed"
},
{
- "scale_quantized.cl",
-#include "./cl_kernels/scale_quantized.clembed"
+ "common/reverse.cl",
+#include "./cl_kernels/common/reverse.clembed"
},
{
- "select.cl",
-#include "./cl_kernels/select.clembed"
+ "common/roi_align_layer.cl",
+#include "./cl_kernels/common/roi_align_layer.clembed"
},
{
- "softmax_layer.cl",
-#include "./cl_kernels/softmax_layer.clembed"
+ "common/roi_align_layer_quantized.cl",
+#include "./cl_kernels/common/roi_align_layer_quantized.clembed"
},
{
- "softmax_layer_quantized.cl",
-#include "./cl_kernels/softmax_layer_quantized.clembed"
+ "common/roi_pooling_layer.cl",
+#include "./cl_kernels/common/roi_pooling_layer.clembed"
},
{
- "slice_ops.cl",
-#include "./cl_kernels/slice_ops.clembed"
+ "nchw/scale.cl",
+#include "./cl_kernels/nchw/scale.clembed"
},
{
- "space_to_batch.cl",
-#include "./cl_kernels/space_to_batch.clembed"
+ "nhwc/scale.cl",
+#include "./cl_kernels/nhwc/scale.clembed"
},
{
- "space_to_depth.cl",
-#include "./cl_kernels/space_to_depth.clembed"
+ "nchw/scale_quantized.cl",
+#include "./cl_kernels/nchw/scale_quantized.clembed"
},
{
- "stack_layer.cl",
-#include "./cl_kernels/stack_layer.clembed"
+ "nhwc/scale_quantized.cl",
+#include "./cl_kernels/nhwc/scale_quantized.clembed"
},
{
- "tile.cl",
-#include "./cl_kernels/tile.clembed"
+ "common/select.cl",
+#include "./cl_kernels/common/select.clembed"
},
{
- "transpose.cl",
-#include "./cl_kernels/transpose.clembed"
+ "common/softmax_layer.cl",
+#include "./cl_kernels/common/softmax_layer.clembed"
+ },
+ {
+ "common/softmax_layer_quantized.cl",
+#include "./cl_kernels/common/softmax_layer_quantized.clembed"
+ },
+ {
+ "common/slice_ops.cl",
+#include "./cl_kernels/common/slice_ops.clembed"
+ },
+ {
+ "nchw/space_to_batch.cl",
+#include "./cl_kernels/nchw/space_to_batch.clembed"
+ },
+ {
+ "nhwc/space_to_batch.cl",
+#include "./cl_kernels/nhwc/space_to_batch.clembed"
+ },
+ {
+ "nchw/space_to_depth.cl",
+#include "./cl_kernels/nchw/space_to_depth.clembed"
+ },
+ {
+ "nhwc/space_to_depth.cl",
+#include "./cl_kernels/nhwc/space_to_depth.clembed"
+ },
+ {
+ "common/stack_layer.cl",
+#include "./cl_kernels/common/stack_layer.clembed"
+ },
+ {
+ "common/tile.cl",
+#include "./cl_kernels/common/tile.clembed"
+ },
+ {
+ "common/transpose.cl",
+#include "./cl_kernels/common/transpose.clembed"
},
{
"types.h",
#include "./cl_kernels/types.hembed"
},
{
- "unpooling_layer.cl",
-#include "./cl_kernels/unpooling_layer.clembed"
+ "common/unpooling_layer.cl",
+#include "./cl_kernels/common/unpooling_layer.clembed"
+ },
+ {
+ "nchw/winograd_filter_transform.cl",
+#include "./cl_kernels/nchw/winograd_filter_transform.clembed"
+ },
+ {
+ "nhwc/winograd_filter_transform.cl",
+#include "./cl_kernels/nhwc/winograd_filter_transform.clembed"
+ },
+ {
+ "nchw/winograd_input_transform.cl",
+#include "./cl_kernels/nchw/winograd_input_transform.clembed"
},
{
- "winograd_filter_transform.cl",
-#include "./cl_kernels/winograd_filter_transform.clembed"
+ "nhwc/winograd_input_transform.cl",
+#include "./cl_kernels/nhwc/winograd_input_transform.clembed"
},
{
- "winograd_input_transform.cl",
-#include "./cl_kernels/winograd_input_transform.clembed"
+ "nchw/winograd_output_transform.cl",
+#include "./cl_kernels/nchw/winograd_output_transform.clembed"
},
{
- "winograd_output_transform.cl",
-#include "./cl_kernels/winograd_output_transform.clembed"
+ "nhwc/winograd_output_transform.cl",
+#include "./cl_kernels/nhwc/winograd_output_transform.clembed"
},
#endif /* EMBEDDED_KERNELS */
};