Diffstat (limited to 'src/gpu')
182 files changed, 28436 insertions, 0 deletions
diff --git a/src/gpu/cl/ClCompileContext.h b/src/gpu/cl/ClCompileContext.h new file mode 100644 index 0000000000..e69cc0200f --- /dev/null +++ b/src/gpu/cl/ClCompileContext.h @@ -0,0 +1,36 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_COMPILE_CONTEXT_H +#define ARM_COMPUTE_CL_COMPILE_CONTEXT_H + +#include "arm_compute/core/CL/CLCompileContext.h" + +namespace arm_compute +{ +namespace opencl +{ +using ClCompileContext = arm_compute::CLCompileContext; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_COMPILE_CONTEXT_H */ diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp new file mode 100644 index 0000000000..5cd969e7f2 --- /dev/null +++ b/src/gpu/cl/ClKernelLibrary.cpp @@ -0,0 +1,1029 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/ClKernelLibrary.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Utils.h" + +#include <algorithm> +#include <array> +#include <fstream> +#include <utility> + +#ifdef ARM_COMPUTE_COMPRESSED_KERNELS +#include <zlib.h> + +namespace +{ +/* Decoding table */ +constexpr std::array<uint8_t, 256> b64_invtab = +{ + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63, + 52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0, + 0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, + 15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0, + 0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, + 41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, + 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, +}; + +/** Decode a base64 encoded string + * + * @param[in] str Base64 encoded string to decode + * + * @return The decode string in case of a valid, non-empty string otherwise an empty string + */ +std::string decode_base64(const std::string &str) +{ + constexpr const char pad_char = '='; + + // Handle empty string + if(str.empty()) + { + return {}; + } + + // Base64 encoded string has size multiple of 4 + if(str.length() % 4) + { + return {}; + } + + // + // Check encoded string padding + std::size_t padding = (str.rbegin()[0] == pad_char) + (str.rbegin()[1] == pad_char); + const int str_len = str.size(); + + // Reserve memory for the decoded string + // Note each 4 consecutive elements of 6-bit encode 3 bytes + std::string dec_b64; + dec_b64.reserve(((str_len / 4) * 3)); + + // Block decoding function (exclude padding) + int c = 0; + const int end = str_len - 4 - padding; + for(; c <= end; c += 4) + { + const int byte0 = b64_invtab[str[c]]; + const int byte1 = b64_invtab[str[c + 1]]; + const int byte2 = b64_invtab[str[c + 2]]; + const int byte3 = b64_invtab[str[c + 3]]; + + dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); + dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); + dec_b64.push_back((byte2 << 6) | (byte3)); + } + + // Last step that might contain padding symbols + if(padding == 1) + { + const int byte0 = b64_invtab[str[c]]; + const int byte1 = b64_invtab[str[c + 1]]; + const int byte2 = b64_invtab[str[c + 2]]; + + dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); + dec_b64.push_back((byte1 << 4) | (byte2 >> 2)); + } + else if(padding == 2) + { + const int byte0 = b64_invtab[str[c]]; + const int byte1 = b64_invtab[str[c + 1]]; + + dec_b64.push_back((byte0 << 2) | (byte1 >> 4)); + } + + return dec_b64; +} + +/** Decompress a zlib compressed string + * + * @param[in] str ZLib compressed string + * + * @return The decompressed string if successful, otherwise false. 
+ */ +std::string decompress_zlib(const std::string &str) +{ + // Create and initialize decompression stream + z_stream ds{}; + if(inflateInit(&ds) != Z_OK) + { + return std::string(); + } + ds.avail_in = str.size(); + ds.next_in = (Bytef *)str.data(); + + // Roll-over the string using a buffer and decompress + int status = Z_OK; + char roll_buff[16384]; + std::string inflated_str; + do + { + ds.avail_out = sizeof(roll_buff); + ds.next_out = reinterpret_cast<Bytef *>(roll_buff); + + status = inflate(&ds, 0); + if(inflated_str.size() < ds.total_out) + { + inflated_str.append(roll_buff, ds.total_out - inflated_str.size()); + } + } + while(status == Z_OK); + + // Finalize decompression stream + inflateEnd(&ds); + if(status != Z_STREAM_END) + { + return std::string(); + } + + return inflated_str; +} +} // namespace +#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ + +namespace arm_compute +{ +namespace opencl +{ +const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = +{ + // Common Kernels + { "activation_layer", "common/activation_layer.cl" }, + { "activation_layer_quant", "common/activation_layer_quant.cl" }, + { "activation_layer_quant_f32", "common/activation_layer_quant.cl" }, + { "arg_min_max_x", "common/arg_min_max.cl" }, + { "arg_min_max_y", "common/arg_min_max.cl" }, + { "arg_min_max_z", "common/arg_min_max.cl" }, + { "arg_min_max_w", "common/arg_min_max.cl" }, + { "bitwise_or", "common/bitwise_op.cl" }, + { "bitwise_and", "common/bitwise_op.cl" }, + { "bitwise_xor", "common/bitwise_op.cl" }, + { "bitwise_not", "common/bitwise_op.cl" }, + { "bounding_box_transform", "common/bounding_box_transform.cl" }, + { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" }, + { "compare_equal", "common/comparisons.cl" }, + { "compare_equal_quantized", "common/comparisons.cl" }, + { "compare_notequal", "common/comparisons.cl" }, + { "compare_notequal_quantized", "common/comparisons.cl" }, + { "compare_greater", "common/comparisons.cl" }, + { "compare_greater_quantized", "common/comparisons.cl" }, + { "compare_greaterequal", "common/comparisons.cl" }, + { "compare_greaterequal_quantized", "common/comparisons.cl" }, + { "compare_less", "common/comparisons.cl" }, + { "compare_less_quantized", "common/comparisons.cl" }, + { "compare_lessequal", "common/comparisons.cl" }, + { "compare_lessequal_quantized", "common/comparisons.cl" }, + { "concatenate", "common/concatenate.cl" }, + { "concatenate_width", "common/concatenate.cl" }, + { "concatenate_height", "common/concatenate.cl" }, + { "concatenate_width_x2", "common/concatenate.cl" }, + { "concatenate_width_x4", "common/concatenate.cl" }, + { "col2im", "common/col2im.cl" }, + { "cast_down", "common/cast.cl" }, + { "cast_up", "common/cast.cl" }, + { "convert_fc_weights", "common/convert_fc_weights.cl" }, + { "copy_tensor", "common/copy_tensor.cl" }, + { "crop_tensor", "common/crop_tensor.cl" }, + { "deconvolution_reshape", "common/deconvolution_layer.cl" }, + { "deconvolution_upsample", "common/deconvolution_layer.cl" }, + { "dequantization_layer", "common/dequantization_layer.cl" }, + { "elementwise_operation_ADD", "common/elementwise_operation.cl" }, + { "elementwise_operation_SUB", "common/elementwise_operation.cl" }, + { "elementwise_operation_MAX", "common/elementwise_operation.cl" }, + { "elementwise_operation_MIN", "common/elementwise_operation.cl" }, + { "elementwise_operation_DIV", "common/elementwise_operation.cl" }, + { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" }, + 
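When ARM_COMPUTE_COMPRESSED_KERNELS is defined, each embedded .clembed payload is zlib-deflated kernel source stored as base64 text, and program() (further down in this file) recovers it by chaining the two helpers above. A minimal sketch of that consumer-side pipeline, assuming it lives in the same translation unit as the helpers (the recover_program_source wrapper is illustrative, not part of this patch):

```cpp
#include <cassert>
#include <string>

// Forward declarations of the helpers defined above (same translation unit).
std::string decode_base64(const std::string &str);
std::string decompress_zlib(const std::string &str);

// Mirrors what ClKernelLibrary::program() does for an embedded, compressed entry:
// base64 text -> raw zlib stream -> OpenCL C source.
std::string recover_program_source(const std::string &embedded)
{
    std::string source = decompress_zlib(decode_base64(embedded));
    // Both helpers report failure with an empty string, so one emptiness
    // check covers a malformed encoding as well as a corrupt stream.
    assert(!source.empty() && "Cannot de-compress requested program");
    return source;
}
```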
{ "elementwise_operation_POWER", "common/elementwise_operation.cl" }, + { "elementwise_operation_PRELU", "common/elementwise_operation.cl" }, + { "elementwise_operation_AND", "common/elementwise_operation.cl" }, + { "elementwise_operation_OR", "common/elementwise_operation.cl" }, + { "elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, + { "elementwise_unary", "common/elementwise_unary.cl" }, + { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, + { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, + { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_2_axis_0", "common/fft.cl" }, + { "fft_radix_2_axis_1", "common/fft.cl" }, + { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_3_axis_0", "common/fft.cl" }, + { "fft_radix_3_axis_1", "common/fft.cl" }, + { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_4_axis_0", "common/fft.cl" }, + { "fft_radix_4_axis_1", "common/fft.cl" }, + { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_5_axis_0", "common/fft.cl" }, + { "fft_radix_5_axis_1", "common/fft.cl" }, + { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_7_axis_0", "common/fft.cl" }, + { "fft_radix_7_axis_1", "common/fft.cl" }, + { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, + { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, + { "fft_radix_8_axis_0", "common/fft.cl" }, + { "fft_radix_8_axis_1", "common/fft.cl" }, + { "fft_scale_conj", "common/fft_scale.cl" }, + { "fill_image_borders_constant", "common/fill_border.cl" }, + { "fill_image_borders_replicate", "common/fill_border.cl" }, + { "floor_layer", "common/floor.cl" }, + { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, + { "gather", "common/gather.cl" }, + { "gemm_ma_f16", "common/gemm.cl" }, + { "gemm_ma_f32", "common/gemm.cl" }, + { "gemm_mv", "common/gemv.cl" }, + { "gemm_mv_quantized", "common/gemv.cl" }, + { "gemm_mm_interleaved_transposed_f16", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f16_acc32", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f16_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f32", "common/gemm_v1.cl" }, + { "gemm_mm_interleaved_transposed_f32_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f16_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f16_bifrost_acc32", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f32_bifrost", "common/gemm_v1.cl" }, + { "gemm_mm_floating_point_f32_bifrost_1000", "common/gemm_v1.cl" }, + { "gemm_mm_native", "common/gemm.cl" }, + { 
"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, + { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, + { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, + { "gemm_lc_vm_f32", "common/gemm.cl" }, + { "gemm_reshape_lhs_matrix_nt", "common/gemm.cl" }, + { "gemm_reshape_lhs_matrix_t", "common/gemm.cl" }, + { "gemm_reshape_rhs_matrix_nt", "common/gemm.cl" }, + { "gemm_reshape_rhs_matrix_t", "common/gemm.cl" }, + { "gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, + { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, + { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, + { "gemmlowp_mm_native", "common/gemmlowp.cl" }, + { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, + { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, + { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, + { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, + { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, + { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, + { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, + { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, + { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, + { "instance_normalization", "common/instance_normalization.cl" }, + { "compute_mean_var", "common/instance_normalization.cl" }, + { "l2_normalize_x", "common/l2_normalize.cl" }, + { "l2_normalize_y", "common/l2_normalize.cl" }, + { "l2_normalize_z", "common/l2_normalize.cl" }, + { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, + { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, + { "memset", "common/memset.cl" }, + { "minmax_layer", "common/minmax_layer.cl" }, + { "non_max_suppression", "common/nonmax.cl" }, + { "pad_layer_constant", "common/pad_layer.cl" }, + { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, + { "permute", "common/permute.cl" }, + { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, + { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, + { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, + { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, + { "pooling_layer_2", "common/pooling_layer.cl" }, + { "pooling_layer_3", "common/pooling_layer.cl" }, + { "pooling_layer_optimized_3", "common/pooling_layer.cl" }, + { "pooling_layer_7", "common/pooling_layer.cl" }, + { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, + { "quantization_layer", "common/quantization_layer.cl" }, + { "range", "common/range.cl" }, + { "range_quantized", "common/range.cl" }, + { "reduction_operation_x", "common/reduction_operation.cl" }, + { "reduction_operation_non_parallel_x", "common/reduction_operation.cl" }, + { "reduction_operation_y", "common/reduction_operation.cl" }, + { "reduction_operation_z", "common/reduction_operation.cl" }, + { 
"reduction_operation_w", "common/reduction_operation.cl" }, + { "reshape_layer", "common/reshape_layer.cl" }, + { "reshape_to_columns", "common/convolution_layer.cl" }, + { "reverse", "common/reverse.cl" }, + { "roi_align_layer", "common/roi_align_layer.cl" }, + { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, + { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, + { "select_same_rank", "common/select.cl" }, + { "select_different_rank_2", "common/select.cl" }, + { "select_different_rank_n", "common/select.cl" }, + { "softmax_layer_norm", "common/softmax_layer.cl" }, + { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, + { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, + { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, + { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, + { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, + { "stack_layer", "common/stack_layer.cl" }, + { "strided_slice", "common/slice_ops.cl" }, + { "tile", "common/tile.cl" }, + { "transpose", "common/transpose.cl" }, +#ifdef ENABLE_NCHW_KERNELS + { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, + { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, + { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, + { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, + { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, + { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, + { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, + { "direct_convolution1x1_f32_bifrost", "nchw/direct_convolution1x1.cl" }, + { "direct_convolution3x3", "nchw/direct_convolution3x3.cl" }, + { "direct_convolution3x3_f32_bifrost", "nchw/direct_convolution3x3.cl" }, + { "direct_convolution5x5", "nchw/direct_convolution5x5.cl" }, + { "direct_convolution5x5_f32_bifrost", "nchw/direct_convolution5x5.cl" }, + { "direct_convolution_quantized", "nchw/direct_convolution_quantized.cl" }, + { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, + { "im2col3x3_nchw", "nchw/im2col.cl" }, + { "im2col5x5_nchw", "nchw/im2col.cl" }, + { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, + { "im2col_generic_nchw", "nchw/im2col.cl" }, + { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, + { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, + { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, + { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, + { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, + { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, + { "pooling_layer_2_nchw_indices_fp32", "nchw/pooling_layer.cl" }, + { "pooling_layer_2_nchw_indices_fp16", "nchw/pooling_layer.cl" }, + { "pooling_layer_MxN_quantized_nchw", "nchw/pooling_layer_quantized.cl" }, + { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, + { "remap_nearest_neighbour_nchw", "nchw/remap.cl" }, + { "remap_bilinear_nchw", "nchw/remap.cl" }, + { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, + { "scale_nearest_neighbour_nchw", "nchw/scale.cl" }, + { "scale_bilinear_nchw", "nchw/scale.cl" }, + { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, + { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, + { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, + { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, + { 
"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, + { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, + { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, +#endif /* ENABLE_NCHW_KERNELS */ +#ifdef ENABLE_NHWC_KERNELS + { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, + { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, + { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, + { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, + { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, + { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, + { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, + { "dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, + { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, + { "im2col3x3_nhwc", "nhwc/im2col.cl" }, + { "im2col9x9_nhwc", "nhwc/im2col.cl" }, + { "im2col_generic_nhwc", "nhwc/im2col.cl" }, + { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, + { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, + { 
"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, + { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, + { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, + { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, + { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, + { "remap_nearest_neighbour_nhwc", "nhwc/remap.cl" }, + { "remap_bilinear_nhwc", "nhwc/remap.cl" }, + { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, + { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, + { "scale_bilinear_nhwc", "nhwc/scale.cl" }, + { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, + { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, + { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, + { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, + { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, + { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, + { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, + { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, +#endif /* ENABLE_NHWC_KERNELS */ +}; + +const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = +{ +#ifdef EMBEDDED_KERNELS + { + "common/activation_layer.cl", +#include "./cl_kernels/common/activation_layer.clembed" + }, + { + "common/activation_layer_quant.cl", +#include "./cl_kernels/common/activation_layer_quant.clembed" + }, + { + 
"common/arg_min_max.cl", +#include "./cl_kernels/common/arg_min_max.clembed" + }, + { + "common/bitwise_op.cl", +#include "./cl_kernels/common/bitwise_op.clembed" + }, + { + "common/bounding_box_transform.cl", +#include "./cl_kernels/common/bounding_box_transform.clembed" + }, + { + "common/bounding_box_transform_quantized.cl", +#include "./cl_kernels/common/bounding_box_transform_quantized.clembed" + }, + { + "common/col2im.cl", +#include "./cl_kernels/common/col2im.clembed" + }, + { + "common/comparisons.cl", +#include "./cl_kernels/common/comparisons.clembed" + }, + { + "common/concatenate.cl", +#include "./cl_kernels/common/concatenate.clembed" + }, + { + "common/convert_fc_weights.cl", +#include "./cl_kernels/common/convert_fc_weights.clembed" + }, + { + "common/convolution_layer.cl", +#include "./cl_kernels/common/convolution_layer.clembed" + }, + { + "common/copy_tensor.cl", +#include "./cl_kernels/common/copy_tensor.clembed" + }, + { + "common/crop_tensor.cl", +#include "./cl_kernels/common/crop_tensor.clembed" + }, + { + "common/deconvolution_layer.cl", +#include "./cl_kernels/common/deconvolution_layer.clembed" + }, + { + "common/cast.cl", +#include "./cl_kernels/common/cast.clembed" + }, + { + "common/dequantization_layer.cl", +#include "./cl_kernels/common/dequantization_layer.clembed" + }, + { + "common/elementwise_operation.cl", +#include "./cl_kernels/common/elementwise_operation.clembed" + }, + { + "common/elementwise_operation_quantized.cl", +#include "./cl_kernels/common/elementwise_operation_quantized.clembed" + }, + { + "common/elementwise_unary.cl", +#include "./cl_kernels/common/elementwise_unary.clembed" + }, + { + "common/fft.cl", +#include "./cl_kernels/common/fft.clembed" + }, + { + "common/fft_digit_reverse.cl", +#include "./cl_kernels/common/fft_digit_reverse.clembed" + }, + { + "common/fft_scale.cl", +#include "./cl_kernels/common/fft_scale.clembed" + }, + { + "common/fill_border.cl", +#include "./cl_kernels/common/fill_border.clembed" + }, + { + "common/floor.cl", +#include "./cl_kernels/common/floor.clembed" + }, + { + "common/gather.cl", +#include "./cl_kernels/common/gather.clembed" + }, + { + "common/gemm.cl", +#include "./cl_kernels/common/gemm.clembed" + }, + { + "common/gemm_v1.cl", +#include "./cl_kernels/common/gemm_v1.clembed" + }, + { + "common/gemmlowp.cl", +#include "./cl_kernels/common/gemmlowp.clembed" + }, + { + "common/gemv.cl", +#include "./cl_kernels/common/gemv.clembed" + }, + { + "common/generate_proposals.cl", +#include "./cl_kernels/common/generate_proposals.clembed" + }, + { + "common/generate_proposals_quantized.cl", +#include "./cl_kernels/common/generate_proposals_quantized.clembed" + }, + { + "helpers.h", +#include "./cl_kernels/helpers.hembed" + }, + { + "helpers_asymm.h", +#include "./cl_kernels/helpers_asymm.hembed" + }, + { + "common/instance_normalization.cl", +#include "./cl_kernels/common/instance_normalization.clembed" + }, + { + "common/l2_normalize.cl", +#include "./cl_kernels/common/l2_normalize.clembed" + }, + { + "common/mean_stddev_normalization.cl", +#include "./cl_kernels/common/mean_stddev_normalization.clembed" + }, + { + "common/memset.cl", +#include "./cl_kernels/common/memset.clembed" + }, + { + "common/minmax_layer.cl", +#include "./cl_kernels/common/minmax_layer.clembed" + }, + { + "common/nonmax.cl", +#include "./cl_kernels/common/nonmax.clembed" + }, + { + "common/batchnormalization_layer.cl", +#include "./cl_kernels/common/batchnormalization_layer.clembed" + }, + { + "common/pad_layer.cl", +#include 
"./cl_kernels/common/pad_layer.clembed" + }, + { + "common/permute.cl", +#include "./cl_kernels/common/permute.clembed" + }, + { + "common/pixelwise_mul_float.cl", +#include "./cl_kernels/common/pixelwise_mul_float.clembed" + }, + { + "common/pixelwise_mul_int.cl", +#include "./cl_kernels/common/pixelwise_mul_int.clembed" + }, + { + "common/pooling_layer.cl", +#include "./cl_kernels/common/pooling_layer.clembed" + }, + { + "common/qlstm_layer_normalization.cl", +#include "./cl_kernels/common/qlstm_layer_normalization.clembed" + }, + { + "common/quantization_layer.cl", +#include "./cl_kernels/common/quantization_layer.clembed" + }, + { + "common/range.cl", +#include "./cl_kernels/common/range.clembed" + }, + { + "common/reduction_operation.cl", +#include "./cl_kernels/common/reduction_operation.clembed" + }, + { + "common/reshape_layer.cl", +#include "./cl_kernels/common/reshape_layer.clembed" + }, + { + "common/reverse.cl", +#include "./cl_kernels/common/reverse.clembed" + }, + { + "common/roi_align_layer.cl", +#include "./cl_kernels/common/roi_align_layer.clembed" + }, + { + "common/roi_align_layer_quantized.cl", +#include "./cl_kernels/common/roi_align_layer_quantized.clembed" + }, + { + "common/roi_pooling_layer.cl", +#include "./cl_kernels/common/roi_pooling_layer.clembed" + }, + { + "common/select.cl", +#include "./cl_kernels/common/select.clembed" + }, + { + "common/softmax_layer.cl", +#include "./cl_kernels/common/softmax_layer.clembed" + }, + { + "common/softmax_layer_quantized.cl", +#include "./cl_kernels/common/softmax_layer_quantized.clembed" + }, + { + "common/slice_ops.cl", +#include "./cl_kernels/common/slice_ops.clembed" + }, + { + "common/stack_layer.cl", +#include "./cl_kernels/common/stack_layer.clembed" + }, + { + "common/tile.cl", +#include "./cl_kernels/common/tile.clembed" + }, + { + "common/transpose.cl", +#include "./cl_kernels/common/transpose.clembed" + }, + { + "types.h", +#include "./cl_kernels/types.hembed" + }, + { + "common/unpooling_layer.cl", +#include "./cl_kernels/common/unpooling_layer.clembed" + }, +#ifdef ENABLE_NCHW_KERNELS + { + "nchw/batch_to_space.cl", +#include "./cl_kernels/nchw/batch_to_space.clembed" + }, + { + "nchw/channel_shuffle.cl", +#include "./cl_kernels/nchw/channel_shuffle.clembed" + }, + { + "nchw/upsample_layer.cl", +#include "./cl_kernels/nchw/upsample_layer.clembed" + }, + { + "nchw/depth_to_space.cl", +#include "./cl_kernels/nchw/depth_to_space.clembed" + }, + { + "nchw/dequantization_layer.cl", +#include "./cl_kernels/nchw/dequantization_layer.clembed" + }, + { + "nchw/direct_convolution1x1.cl", +#include "./cl_kernels/nchw/direct_convolution1x1.clembed" + }, + { + "nchw/direct_convolution3x3.cl", +#include "./cl_kernels/nchw/direct_convolution3x3.clembed" + }, + { + "nchw/direct_convolution5x5.cl", +#include "./cl_kernels/nchw/direct_convolution5x5.clembed" + }, + { + "nchw/direct_convolution_quantized.cl", +#include "./cl_kernels/nchw/direct_convolution_quantized.clembed" + }, + { + "nchw/im2col.cl", +#include "./cl_kernels/nchw/im2col.clembed" + }, + { + "nchw/normalization_layer.cl", +#include "./cl_kernels/nchw/normalization_layer.clembed" + }, + { + "nchw/normalize_planar_yuv_layer.cl", +#include "./cl_kernels/nchw/normalize_planar_yuv_layer.clembed" + }, + { + "nchw/normalize_planar_yuv_layer_quantized.cl", +#include "./cl_kernels/nchw/normalize_planar_yuv_layer_quantized.clembed" + }, + { + "nchw/batchnormalization_layer.cl", +#include "./cl_kernels/nchw/batchnormalization_layer.clembed" + }, + { + 
"nchw/pooling_layer.cl", +#include "./cl_kernels/nchw/pooling_layer.clembed" + }, + { + "nchw/pooling_layer_quantized.cl", +#include "./cl_kernels/nchw/pooling_layer_quantized.clembed" + }, + { + "nchw/prior_box_layer.cl", +#include "./cl_kernels/nchw/prior_box_layer.clembed" + }, + { + "nchw/remap.cl", +#include "./cl_kernels/nchw/remap.clembed" + }, + { + "nchw/reorg_layer.cl", +#include "./cl_kernels/nchw/reorg_layer.clembed" + }, + { + "nchw/scale.cl", +#include "./cl_kernels/nchw/scale.clembed" + }, + { + "nchw/space_to_batch.cl", +#include "./cl_kernels/nchw/space_to_batch.clembed" + }, + { + "nchw/space_to_depth.cl", +#include "./cl_kernels/nchw/space_to_depth.clembed" + }, + { + "nchw/winograd_filter_transform.cl", +#include "./cl_kernels/nchw/winograd_filter_transform.clembed" + }, + { + "nchw/winograd_input_transform.cl", +#include "./cl_kernels/nchw/winograd_input_transform.clembed" + }, + { + "nchw/winograd_output_transform.cl", +#include "./cl_kernels/nchw/winograd_output_transform.clembed" + }, +#endif /* ENABLE_NCHW_KERNELS */ + +#ifdef ENABLE_NHWC_KERNELS + { + "nhwc/batch_to_space.cl", +#include "./cl_kernels/nhwc/batch_to_space.clembed" + }, + { + "nhwc/channel_shuffle.cl", +#include "./cl_kernels/nhwc/channel_shuffle.clembed" + }, + { + "nhwc/upsample_layer.cl", +#include "./cl_kernels/nhwc/upsample_layer.clembed" + }, + { + "nhwc/depth_to_space.cl", +#include "./cl_kernels/nhwc/depth_to_space.clembed" + }, + { + "nhwc/dequantization_layer.cl", +#include "./cl_kernels/nhwc/dequantization_layer.clembed" + }, + { + "nhwc/direct_convolution.cl", +#include "./cl_kernels/nhwc/direct_convolution.clembed" + }, + { + "nhwc/dwc_native_fp_nhwc.cl", +#include "./cl_kernels/nhwc/dwc_native_fp_nhwc.clembed" + }, + { + "nhwc/dwc_native_quantized_nhwc.cl", +#include "./cl_kernels/nhwc/dwc_native_quantized_nhwc.clembed" + }, + { + "nhwc/normalization_layer.cl", +#include "./cl_kernels/nhwc/normalization_layer.clembed" + }, + { + "nhwc/normalize_planar_yuv_layer.cl", +#include "./cl_kernels/nhwc/normalize_planar_yuv_layer.clembed" + }, + { + "nhwc/normalize_planar_yuv_layer_quantized.cl", +#include "./cl_kernels/nhwc/normalize_planar_yuv_layer_quantized.clembed" + }, + { + "nhwc/im2col.cl", +#include "./cl_kernels/nhwc/im2col.clembed" + }, + { + "nhwc/batchnormalization_layer.cl", +#include "./cl_kernels/nhwc/batchnormalization_layer.clembed" + }, + { + "nhwc/pooling_layer.cl", +#include "./cl_kernels/nhwc/pooling_layer.clembed" + }, + { + "nhwc/pooling_layer_quantized.cl", +#include "./cl_kernels/nhwc/pooling_layer_quantized.clembed" + }, + { + "nhwc/remap.cl", +#include "./cl_kernels/nhwc/remap.clembed" + }, + { + "nhwc/reorg_layer.cl", +#include "./cl_kernels/nhwc/reorg_layer.clembed" + }, + { + "nhwc/scale.cl", +#include "./cl_kernels/nhwc/scale.clembed" + }, + { + "nhwc/space_to_batch.cl", +#include "./cl_kernels/nhwc/space_to_batch.clembed" + }, + { + "nhwc/space_to_depth.cl", +#include "./cl_kernels/nhwc/space_to_depth.clembed" + }, + { + "nhwc/winograd_filter_transform.cl", +#include "./cl_kernels/nhwc/winograd_filter_transform.clembed" + }, + { + "nhwc/winograd_input_transform.cl", +#include "./cl_kernels/nhwc/winograd_input_transform.clembed" + }, + { + "nhwc/winograd_output_transform.cl", +#include "./cl_kernels/nhwc/winograd_output_transform.clembed" + }, +#endif /* ENABLE_NHWC_KERNELS */ +#endif /* EMBEDDED_KERNELS */ +}; + +ClKernelLibrary &ClKernelLibrary::get() +{ + static ClKernelLibrary _kernel_library; + return _kernel_library; +} + +std::string 
ClKernelLibrary::program_name(const std::string &kernel_name) const +{ + // Find which program contains the kernel + auto kernel_program_it = _kernel_program_map.find(kernel_name); + + if(_kernel_program_map.end() == kernel_program_it) + { + ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); + } + + const std::string program_name = kernel_program_it->second; + + return program_name; +} + +void ClKernelLibrary::set_kernel_path(std::string kernel_path) +{ + _kernel_path = std::move(kernel_path); + _kernel_path += "/"; +} + +const std::string &ClKernelLibrary::kernel_path() const +{ + return _kernel_path; +} + +ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &program_name) const +{ +#ifdef EMBEDDED_KERNELS +#ifdef ARM_COMPUTE_COMPRESSED_KERNELS + const auto inflated_program_source_it = _decompressed_source_map.find(program_name); + if(inflated_program_source_it != _decompressed_source_map.end()) + { + return ClProgramInfo{ inflated_program_source_it->second, false }; + } +#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ + + const auto program_source_it = _program_source_map.find(program_name); + if(program_source_it == _program_source_map.end()) + { + ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); + } + std::string program_source = program_source_it->second; + +#ifdef ARM_COMPUTE_COMPRESSED_KERNELS + std::string decompressed_program_source = decompress_zlib(decode_base64(program_source_it->second)); + ARM_COMPUTE_ERROR_ON_MSG(decompressed_program_source.empty(), "Cannot de-compress requested program"); + _decompressed_source_map.insert(std::make_pair(program_name, decompressed_program_source)); + program_source = std::move(decompressed_program_source); +#endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ + + return ClProgramInfo{ program_source, false }; +#else /* EMBEDDED_KERNELS */ + // Check for binary + std::string source_name = _kernel_path + program_name; + std::string binary_name = source_name + "bin"; + std::string program_source{}; + bool is_binary = false; + + if(std::ifstream(binary_name).is_open()) + { + program_source = read_file(binary_name, true); + is_binary = true; + } + else if(std::ifstream(source_name).is_open()) + { + program_source = read_file(source_name, false); + } + else + { + ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str()); + } + + return ClProgramInfo{ program_source, is_binary }; +#endif /* EMBEDDED_KERNELS */ +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h new file mode 100644 index 0000000000..42bec95032 --- /dev/null +++ b/src/gpu/cl/ClKernelLibrary.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_KERNEL_LIBRARY_H +#define ARM_COMPUTE_CL_KERNEL_LIBRARY_H + +#include <map> +#include <string> +#include <tuple> + +namespace arm_compute +{ +namespace opencl +{ +/** ClKernelLibrary contains all the OpenCL kernels that are used throughout the library + * + * @note Kernel library is a singleton to reduce memory requirements + * @note Its sole responsibility is to provide access to the kernel strings; + * it does not perform any compilation or related tasks + */ +class ClKernelLibrary final +{ +private: + /** Default Constructor */ + ClKernelLibrary() = default; + /** Prevent instances of this class from being copied */ + ClKernelLibrary(const ClKernelLibrary &) = delete; + /** Prevent instances of this class from being copied */ + const ClKernelLibrary &operator=(const ClKernelLibrary &) = delete; + +public: + /** Structure to encapsulate program-related information */ + struct ClProgramInfo + { + std::string program{}; /**< Program raw string */ + bool is_binary{ false }; /**< Flag that indicates if the program is in binary format */ + }; + +public: + /** Access the KernelLibrary singleton + * + * @return The KernelLibrary instance + */ + static ClKernelLibrary &get(); + /** Sets the path that the kernels reside in + * + * @param[in] kernel_path Path of the kernel + */ + void set_kernel_path(std::string kernel_path); + /** Gets the path that the kernels reside in + */ + const std::string &kernel_path() const; + /** Gets the source of the selected program + * + * @param[in] program_name Program name + * + * @return The ClProgramInfo of the selected program: raw source when is_binary is false, binary otherwise + */ + ClProgramInfo program(const std::string &program_name) const; + /** Returns the program name given a kernel name + * + * @param[in] kernel_name Kernel name + * + * @return Program name + */ + std::string program_name(const std::string &kernel_name) const; + +private: + std::string _kernel_path{}; /**< Path to the kernels folder. */ + mutable std::map<std::string, std::string> _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */ + static const std::map<std::string, std::string> _kernel_program_map; /**< Map that associates kernel names with programs. */ + static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs. + Used for compile-time kernel inclusion. */ +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_KERNEL_LIBRARY_H */ diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h new file mode 100644 index 0000000000..52ea3c9183 --- /dev/null +++ b/src/gpu/cl/IClKernel.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 Arm Limited.
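With the ClKernelLibrary header complete, the whole lookup flow it defines is: resolve a kernel name to its program with program_name(), then fetch the source or binary with program(). A hypothetical caller sketch (the kernel name is taken from the map above; the on-disk path is an example and only matters for non-EMBEDDED_KERNELS builds):

```cpp
#include "src/gpu/cl/ClKernelLibrary.h"

#include <string>

using arm_compute::opencl::ClKernelLibrary;

void fetch_activation_source()
{
    ClKernelLibrary &klib = ClKernelLibrary::get(); // singleton accessor

    // Only consulted when kernels are loaded from disk; example path.
    klib.set_kernel_path("/usr/share/arm_compute/cl_kernels");

    // "activation_layer" maps to "common/activation_layer.cl" via _kernel_program_map.
    const std::string program = klib.program_name("activation_layer");

    // ClProgramInfo::is_binary distinguishes raw source from a pre-built binary.
    const ClKernelLibrary::ClProgramInfo info = klib.program(program);
    if(!info.is_binary)
    {
        // info.program holds OpenCL C source ready to hand to the compile context.
    }
}
```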
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ICL_KERNEL_H +#define ARM_COMPUTE_ICL_KERNEL_H + +#include "arm_compute/core/ITensorInfo.h" +#include "src/core/CL/ICLKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +using IClKernel = arm_compute::ICLKernel; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICL_KERNEL_H */ diff --git a/src/gpu/cl/IClOperator.h b/src/gpu/cl/IClOperator.h new file mode 100644 index 0000000000..049bf05dc1 --- /dev/null +++ b/src/gpu/cl/IClOperator.h @@ -0,0 +1,37 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_ICL_OPERATOR_H +#define ARM_COMPUTE_ICL_OPERATOR_H + +#include "arm_compute/core/ITensorInfo.h" +#include "arm_compute/runtime/CL/ICLOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +using IClOperator = experimental::ICLOperator; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_ICL_OPERATOR_H */ diff --git a/src/gpu/cl/kernels/ClActivationKernel.cpp b/src/gpu/cl/kernels/ClActivationKernel.cpp new file mode 100644 index 0000000000..13d55b3f5a --- /dev/null +++ b/src/gpu/cl/kernels/ClActivationKernel.cpp @@ -0,0 +1,255 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClActivationKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +#include <set> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16, DataType::F16, DataType::F32); + + static std::set<ActivationLayerInfo::ActivationFunction> quantized_supported_activations = + { + ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LOGISTIC, + ActivationLayerInfo::ActivationFunction::TANH, + ActivationLayerInfo::ActivationFunction::HARD_SWISH, + ActivationLayerInfo::ActivationFunction::LEAKY_RELU, + }; + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? 
dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), + "For quantized data types, only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + + // Checks performed when destination is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + return Status{}; +} +} // namespace + +ClActivationKernel::ClActivationKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClActivationKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + + auto padding_info = get_padding_info({ src, dst }); + + _run_in_place = (dst == nullptr) || (dst == src); + + if(dst != nullptr) + { + // Destination auto initialization if not yet initialized + auto_init_if_empty(*dst, *src->clone()); + } + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, (dst != nullptr) ?
dst : nullptr, act_info)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); + + const DataType dt = src->data_type(); + float a_const = act_info.a(); + float b_const = act_info.b(); + + const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); + const bool is_quantized = is_data_type_quantized(dt); + const bool perform_activation_in_float = + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) + || (f_act == ActivationLayerInfo::ActivationFunction::TANH) + || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) + || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option_if(perform_activation_in_float, "-DFLOAT_DOMAIN"); + build_opts.add_option_if(_run_in_place, "-DIN_PLACE"); + build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + + std::string kernel_name = std::string("activation_layer"); + + // Set quantization info build options + if(is_quantized) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + + if(!perform_activation_in_float) + { + int a_const_int = 0; + int b_const_int = 0; + + // Create quantized version of constants a, b if needed + switch(dt) + { + case DataType::QASYMM8: + { + a_const_int = quantize_qasymm8(a_const, iq_info); + b_const_int = quantize_qasymm8(b_const, iq_info); + } + break; + case DataType::QASYMM8_SIGNED: + { + a_const_int = quantize_qasymm8_signed(a_const, iq_info); + b_const_int = quantize_qasymm8_signed(b_const, iq_info); + } + break; + case DataType::QSYMM16: + { + a_const_int = quantize_qsymm16(a_const, iq_info); + b_const_int = quantize_qsymm16(b_const, iq_info); + } + break; + default: + break; + } + build_opts.add_option(("-DA_VAL=" + support::cpp11::to_string(a_const_int))); + build_opts.add_option(("-DB_VAL=" + support::cpp11::to_string(b_const_int))); + } + else + { + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); + } + + // Quantized value of 0 corresponds to the offset o1 + build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); + build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); + + // Set correct kernel name + kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); + + // Set scale and offset of the source and destination if they have different quantization info + if(dst != nullptr) + { + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + if(iq_info != oq_info) + { + build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); + } + } + } + else + { + // Set A, B constants in build options for float types + build_opts.add_option(("-DA_VAL=" + float_to_string_with_full_precision(a_const))); + build_opts.add_option(("-DB_VAL=" + float_to_string_with_full_precision(b_const))); + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + // Set config_id for enabling LWS tuning + _config_id = "activation_layer_"; + _config_id += lower_string(string_from_data_type(dt)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(1)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClActivationKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, act_info)); + return Status{}; +} + +void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + if(!_run_in_place) + { + add_3D_tensor_argument(idx, dst, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h new file mode 100644 index 0000000000..95e010889e --- /dev/null +++ b/src/gpu/cl/kernels/ClActivationKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ACTIVATION_KERNEL_H +#define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the activation kernel. */ +class ClActivationKernel : public IClKernel +{ +public: + ClActivationKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClActivationKernel); + /** Configure kernel for a given list of arguments + * + * @note If the output tensor is a nullptr, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src Source tensor info. In case of @p dst tensor = nullptr, this tensor will store the result + * of the activation function. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + * @param[in] act_info Activation layer information. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClActivationKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + +private: + bool _run_in_place{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ACTIVATION_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp new file mode 100644 index 0000000000..8c2af5ffb6 --- /dev/null +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp @@ -0,0 +1,153 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimZ) != dst->dimension(Window::DimZ)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(3) + batch_offset > dst->dimension(3)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(4, src, dst); + + return Status{}; +} +} // namespace + +ClBatchConcatenateKernel::ClBatchConcatenateKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + _batch_offset = batch_offset; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + win.set(3, Window::Dimension(0, src->tensor_shape()[3], 1)); + ICLKernel::configure_internal(win); + + // Set config_id for 
enabling LWS tuning
+    _config_id = "concatenate_";
+    _config_id += support::cpp11::to_string(3);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(batch_offset);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(2));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(3));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClBatchConcatenateKernel::validate(const arm_compute::ITensorInfo *src,
+                                          unsigned int                    batch_offset,
+                                          const arm_compute::ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, batch_offset, dst));
+    return Status{};
+}
+
+void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice = window.first_slice_window_3D();
+
+    const int offset_to_first_elements_in_bytes = _batch_offset * dst->info()->strides_in_bytes()[3];
+
+    unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters
+    _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes);
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
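The batch-offset handling in run_op() above reduces to one multiplication: the kernel writes src starting at batch _batch_offset of dst, so the extra kernel argument is just that index scaled by dst's batch stride. A standalone sketch of the arithmetic follows; the shape, element size and batch_offset are made-up example values, not taken from this patch:

    #include <array>
    #include <cstddef>
    #include <iostream>

    int main()
    {
        // Assumed example: dst has shape [x=8, y=4, z=2, batch=6] with 4-byte elements.
        const std::array<std::size_t, 4> shape{ 8, 4, 2, 6 };
        const std::size_t                element_size = 4;

        // Dense strides in bytes, innermost dimension first (mirrors strides_in_bytes()).
        std::array<std::size_t, 4> strides{};
        strides[0] = element_size;
        for(std::size_t d = 1; d < shape.size(); ++d)
        {
            strides[d] = strides[d - 1] * shape[d - 1];
        }

        // With batch_offset = 2 the kernel starts writing at the third batch of dst.
        const std::size_t batch_offset                      = 2;
        const std::size_t offset_to_first_elements_in_bytes = batch_offset * strides[3];
        std::cout << offset_to_first_elements_in_bytes << "\n"; // 2 * (8 * 4 * 2 * 4) = 512
    }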
diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
new file mode 100644
index 0000000000..f6b7c0ed09
--- /dev/null
+++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h
@@ -0,0 +1,74 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+#define ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the batch concatenate kernel.
+ *  The src tensor will be concatenated into the destination tensor.
+ */
+class ClBatchConcatenateKernel : public IClKernel
+{
+public:
+    ClBatchConcatenateKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClBatchConcatenateKernel);
+    /** Initialise the kernel's source and destination
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     src             Source tensor info. Data types supported: All.
+     * @param[in]     batch_offset    The offset on axis # 3.
+     * @param[in,out] dst             Destination tensor info. Data types supported: Same as @p src.
+     *
+     * @note The two lowest dimensions of @p dst can't be smaller than those of @p src.
+     * @note The gaps between the two lowest dimensions of @p src and @p dst need to be divisible by 2.
+     *
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClBatchConcatenateKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, unsigned int batch_offset, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+private:
+    unsigned int _batch_offset{ 0 };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_BATCH_CONCATENATE_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp
new file mode 100644
index 0000000000..48caf21d16
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCastKernel.cpp
@@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCastKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    ARM_COMPUTE_UNUSED(policy);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src == dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1,
+                                                         DataType::U8, DataType::S8, DataType::QSYMM8_PER_CHANNEL, DataType::S16,
+                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1,
+                                                         DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16,
+                                                         DataType::U16, DataType::U32, DataType::S32, DataType::F16,
+                                                         DataType::F32);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different");
+
+    // Validate in case of configured dst
+    if(dst->total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClCastKernel::ClCastKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Auto initialize dst shape if not initialized (We can only auto-configure the shape, datatype must be given)
+    set_shape_if_empty(*dst, src->tensor_shape());
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy));
+
+    auto padding_info = get_padding_info({ src, dst });
+
+    // Get data sizes
+    const size_t src_size = data_size_from_type(src->data_type());
+    const size_t dst_size = data_size_from_type(dst->data_type());
+
+    // Get number of elements to process per iterations
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+    // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined
+    build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE");
+    build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT");
+    build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED");
+
+    // Create kernel
+    const std::string kernel_name = (src_size >= dst_size) ? "cast_down" : "cast_up";
+    _kernel                       = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel
+    Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win);
+
+    // Collapse window
+    const Window &full_window      = window();
+    Window        collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ);
+    ICLKernel::configure_internal(collapsed_window);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = kernel_name;
+    _config_id += "_";
+    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+}
+
+Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy));
+    return Status{};
+}
+
+void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
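The -DSATURATE option above exists because converting an out-of-range float to an integer type is undefined or implementation-defined in C++ and OpenCL C alike, so float sources must clamp first. A minimal host-side sketch of what saturating a float to U8 means (an illustration, not the OpenCL kernel code):

    #include <algorithm>
    #include <cstdint>
    #include <iostream>

    // Clamp to the destination range before converting, as -DSATURATE requests.
    uint8_t saturate_cast_u8(float v)
    {
        return static_cast<uint8_t>(std::min(std::max(v, 0.0f), 255.0f));
    }

    int main()
    {
        std::cout << static_cast<int>(saturate_cast_u8(300.0f)) << "\n"; // 255
        std::cout << static_cast<int>(saturate_cast_u8(-7.0f)) << "\n";  // 0
    }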
"cast_down" : "cast_up"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + // Collapse window + const Window &full_window = window(); + Window collapsed_window = full_window.collapse_if_possible(full_window, Window::DimZ); + ICLKernel::configure_internal(collapsed_window); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(src->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); +} + +Status ClCastKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, policy)); + return Status{}; +} + +void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h new file mode 100644 index 0000000000..5c223fc5fa --- /dev/null +++ b/src/gpu/cl/kernels/ClCastKernel.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_CAST_KERNEL_H +#define ARM_COMPUTE_CL_CAST_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Casts a given tensor to a new type + * + * @note When casting between quantized types the scale and zeroPoint are ignored + */ +class ClCastKernel : public IClKernel +{ +public: + ClCastKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCastKernel); + /** Set the src and dst of the kernel. + * + * Valid conversions src -> dst : + * + * - QSYMM8_PER_CHANNEL -> QASYMM8 (ATTENTION: it is the user's responsibility to keep track of the quantization info in the TensorInfo meta-data) + * - U8 -> S8, U16, S16, U32, S32, F16, F32 + * - U16 -> U8, S8, S16, U32, S32, F16, F32 + * - S16 -> U8, S8, U16, U32, S32, F16, F32 + * - U32 -> U8, S8, U16, S16, S32, F16, F32 + * - S32 -> U8, S8, U16, S16, U32, F16, F32 + * - F16 -> U8, S8, U16, S16, U32, F32 + * - F32 -> U8, S8, U16, S16, U32, F16 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The source tensor to convert. Data types supported: U8/S8/QSYMM8_PER_CHANNEL/U16/S16/U32/S32/F16/F32. + * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCastKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CAST_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp new file mode 100644 index 0000000000..ac7c4a43f9 --- /dev/null +++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <cmath>
+
+namespace arm_compute
+{
+using namespace misc::shape_calculator;
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32);
+
+    // Checks performed when output is configured
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW");
+    }
+
+    return Status{};
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW));
+
+    constexpr unsigned int num_elems_read_per_iteration = 8;
+
+    // Configure window
+    Window win = calculate_max_window(*src, Steps(num_elems_read_per_iteration));
+
+    // Update window and padding just for the input tensor as we cannot access out-of-bounds elements in the output one
+    AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration);
+    bool                   window_changed = update_window_and_padding(win, input_access);
+
+    Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+    return std::make_pair(err, win);
+}
+} // namespace
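compute_col2im_shape() above turns a GEMM result back into an image: each spatial index s of the convolved output maps to pixel (x, y) = (s % W, s / W). The following standalone sketch shows that index mapping for a single group, assuming (as an assumption of this sketch, not a statement about the kernel's exact memory layout) that the GEMM result stores channels along the fastest axis:

    #include <cstddef>
    #include <iostream>
    #include <vector>

    int main()
    {
        // Assumed example: K=2 output channels, convolved output of W=3, H=2.
        const std::size_t K = 2, W = 3, H = 2;

        // src holds the GEMM result as src[s * K + k] for spatial index s, channel k.
        std::vector<float> src(K * W * H);
        for(std::size_t i = 0; i < src.size(); ++i) src[i] = static_cast<float>(i);

        // dst is the NCHW image, dst[k][y][x].
        std::vector<float> dst(K * H * W);
        for(std::size_t s = 0; s < W * H; ++s)
        {
            const std::size_t x = s % W, y = s / W;
            for(std::size_t k = 0; k < K; ++k)
            {
                dst[(k * H + y) * W + x] = src[s * K + k]; // one matrix element -> one image pixel
            }
        }
        std::cout << dst[0] << " " << dst[W * H] << "\n"; // channels 0 and 1 at (0, 0)
    }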
+
+ClCol2ImKernel::ClCol2ImKernel()
+    : _convolved_dims()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, convolved_dims, num_groups));
+
+    _convolved_dims = convolved_dims;
+
+    const DataType data_type = src->data_type();
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size()));
+    build_opts.add_option("-DWIDTH_INPUT=" + support::cpp11::to_string(src->dimension(0)));
+    build_opts.add_option("-DWIDTH_OUTPUT=" + support::cpp11::to_string(_convolved_dims.width));
+    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+
+    _kernel = create_kernel(compile_context, "col2im", build_opts.options());
+
+    // Configure kernel window
+    auto win_config = validate_and_configure_window(src, dst, _convolved_dims, num_groups);
+    ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+    IClKernel::configure_internal(win_config.second);
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "col2im_";
+    _config_id += lower_string(string_from_data_type(src->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(num_groups);
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(dst->dimension(1));
+}
+
+Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first);
+    return Status{};
+}
+
+void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window);
+
+    bool is_collapsed     = false;
+    bool is_collapsed_out = false;
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window out_window;
+    out_window.use_tensor_dimensions(dst->info()->tensor_shape());
+
+    Window collapsed     = window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &is_collapsed);
+    Window collapsed_out = out_window.collapse_if_possible(out_window, 3, &is_collapsed_out);
+
+    ARM_COMPUTE_ERROR_ON(is_collapsed != is_collapsed_out);
+
+    Window slice     = collapsed.first_slice_window_3D();
+    Window slice_out = collapsed_out.first_slice_window_4D();
+    do
+    {
+        // Set inputs
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_4D_tensor_argument(idx, dst, slice_out);
+        enqueue(queue,
*this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h new file mode 100644 index 0000000000..e19b7c8e16 --- /dev/null +++ b/src/gpu/cl/kernels/ClCol2ImKernel.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_COL2IM_KERNEL_H +#define ARM_COMPUTE_CL_COL2IM_KERNEL_H + +#include "arm_compute/core/Size2D.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the col2im reshaping kernel. + * + * Rearranges each matrix column into image blocks. It's the inverse operation of @ref opencl::kernels::ClIm2ColKernel. + * + * For example, a vector of 9 elements can be reshaped to a block(image) of 3x3: + * + * @f[ + * \left( \begin{array}{ccccccccc} + * a0 & a1 & a2 & a3 & a4 & a5 & a6 & a7 & a8 \\ + * \end{array} \right) + * \rightarrow + * \left( \begin{array}{ccc} + * a0 & a1 & a2 \\ + * a3 & a4 & a5 \\ + * a6 & a7 & a8 \\ + * \end{array} \right) + * @f] + */ +class ClCol2ImKernel : public IClKernel +{ +public: + /** Default constructor */ + ClCol2ImKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCol2ImKernel); + /** Set the input and output of the kernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The input tensor info to convert. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[out] dst The output tensor info. 3 lower dimensions represent a single output [width, height, OFM], + * while the rest represent batch of outputs. Data types supported: Same as @p input. Data layout: NCHW + * @param[in] convolved_dims Output convolved dimensions. 
+ * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClCol2ImKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +public: + Size2D _convolved_dims; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /*ARM_COMPUTE_CL_COL2IM_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp new file mode 100644 index 0000000000..716dec1f30 --- /dev/null +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Utils.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape,
+                                                     DataLayout data_layout)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialisation if not yet initialized
+    auto_init_if_empty(*dst, *src->clone());
+
+    auto padding_info = get_padding_info({ src, dst });
+
+    ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout));
+
+    const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? DataLayout::NHWC : DataLayout::NCHW;
+
+    const int width_idx   = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::WIDTH);
+    const int height_idx  = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::HEIGHT);
+    const int channel_idx = get_data_layout_dimension_index(src_data_layout, DataLayoutDimension::CHANNEL);
+
+    const unsigned int num_elems_per_src_plane = original_src_shape[width_idx] * original_src_shape[height_idx];
+    const unsigned int num_channels            = original_src_shape[channel_idx];
+
+    const unsigned int factor_1 = (data_layout == DataLayout::NCHW) ? num_elems_per_src_plane : num_channels;
+    const unsigned int factor_2 = (data_layout == DataLayout::NCHW) ? num_channels : num_elems_per_src_plane;
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+    build_opts.add_option("-DFACTOR_1=" + support::cpp11::to_string(factor_1));
+    build_opts.add_option("-DFACTOR_2=" + support::cpp11::to_string(factor_2));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "convert_fc_weights", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*src, Steps());
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
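To make FACTOR_1/FACTOR_2 above concrete: each row of the 2D weight matrix corresponds to one element of the original 4D input, so switching the trained layout is a row permutation. One plausible reading of the scheme (an illustration under assumed semantics, not a port of the convert_fc_weights OpenCL kernel) for weights trained in NCHW:

    #include <cstddef>
    #include <iostream>

    int main()
    {
        // Assumed toy case: trained NCHW on an input of C=2, H=2, W=2, so the
        // weight matrix has C * H * W = 8 rows, one per input element.
        const std::size_t C = 2, HW = 4;
        const std::size_t factor_1 = HW; // trained NCHW: elements per plane
        const std::size_t factor_2 = C;

        // Row r in NCHW order is (c, s) = (r / HW, r % HW); the same element in
        // NHWC order lives at row s * C + c, i.e. (r % FACTOR_1) * FACTOR_2 + r / FACTOR_1.
        for(std::size_t r = 0; r < C * HW; ++r)
        {
            const std::size_t r_nhwc = (r % factor_1) * factor_2 + r / factor_1;
            std::cout << r << " -> " << r_nhwc << "\n";
        }
    }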
+
+Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape,
+                                                      DataLayout data_layout)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() != 2);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(1) != original_src_shape.total_size_lower(3));
+    ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN);
+
+    // Checks performed when dst is configured
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+
+void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    unsigned int idx = 0;
+    add_2D_tensor_argument(idx, src, window);
+    add_2D_tensor_argument(idx, dst, window);
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
new file mode 100644
index 0000000000..16000e82f6
--- /dev/null
+++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h
@@ -0,0 +1,73 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H +#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +/** Interface to convert the 2D Fully Connected weights from NCHW to NHWC or vice versa. + * + * @note This function can be applied to the 2D weights used by a Fully Connected layer if: + * - It follows a Convolution layer + * - The data layout used by the network does not match the one the model has been trained in. + * + * @note This function assumes the weights are already reshaped (transposed) + */ +namespace opencl +{ +namespace kernels +{ +class ClConvertFullyConnectedWeightsKernel : public IClKernel +{ +public: + ClConvertFullyConnectedWeightsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClConvertFullyConnectedWeightsKernel); + /** Set the src and dst tensor. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source weights tensor info to convert. Must be 2 dimensional. Data types supported: All. + * @param[out] dst The converted weights tensor info. Shape and Data Type: Same as @p src. + * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp new file mode 100644 index 0000000000..eefdc925a4 --- /dev/null +++ b/src/gpu/cl/kernels/ClCopyKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClCopyKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+
+    // Validate dst if initialized
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        if(dst_window == nullptr)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape());
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst_window->shape());
+        }
+    }
+
+    return Status{};
+}
+
+} // namespace
+
+ClCopyKernel::ClCopyKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window));
+
+    auto padding_info = get_padding_info({ src, dst });
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, *src);
+
+    // Configure window
+    const unsigned int vec_size_x = adjust_vec_size(16 / src->element_size(), src->dimension(0));
+
+    const Window win_config = calculate_max_window(*src, Steps(vec_size_x));
+
+    if(dst_window != nullptr)
+    {
+        _has_dst_window = true;
+        _dst_window     = Window(*dst_window);
+        const int  width_x             = dst_window->num_iterations(0);
+        const int  vec_size_x_leftover = width_x % vec_size_x;
+        const bool multi_access_x      = width_x >= static_cast<int32_t>(vec_size_x);
+
+        if(multi_access_x)
+        {
+            _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x));
+        }
+
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+    }
+    else
+    {
+        const int width_x             = src->tensor_shape().x();
+        const int vec_size_x_leftover = width_x % vec_size_x;
+
+        build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover));
+    }
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+
+    // Build kernel
+    _kernel = create_kernel(compile_context, "copy_tensor", build_opts.options());
+
+    // Validate and set the window
+    ICLKernel::configure_internal(win_config);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
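The VEC_SIZE / VEC_SIZE_LEFTOVER pair that this and the kernels above compute follows one pattern: pick a vector width that fills a 16-byte access, clamp it to the innermost dimension, and hand the kernel the remainder so the last access can be narrower. A standalone sketch with assumed values (the clamping below mirrors what adjust_vec_size() is used for here, not its exact implementation):

    #include <algorithm>
    #include <cstddef>
    #include <iostream>

    // Assumed semantics: clamp the preferred vector width to the tensor's
    // innermost dimension so a single vector access never overruns it.
    std::size_t adjust_vec_size(std::size_t preferred, std::size_t dim_x)
    {
        return std::min(preferred, dim_x);
    }

    int main()
    {
        const std::size_t element_size = 2;  // e.g. F16 (assumption for the example)
        const std::size_t width_x      = 35; // innermost dimension of the tensor

        const std::size_t vec_size = adjust_vec_size(16 / element_size, width_x); // 8
        const std::size_t leftover = width_x % vec_size;                          // 3

        // The kernel is then compiled with -DVEC_SIZE=8 -DVEC_SIZE_LEFTOVER=3:
        // full 8-wide accesses cover 32 elements, the last access handles 3.
        std::cout << vec_size << " " << leftover << "\n";
    }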
+
+Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window));
+
+    return Status{};
+}
+
+void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice;
+
+    if(_has_dst_window)
+    {
+        slice            = window.first_slice_window_3D();
+        Window out_slice = _dst_window.first_slice_window_3D();
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice);
+            add_3D_tensor_argument(idx, dst, out_slice);
+            enqueue(queue, *this, slice, lws_hint());
+        }
+        while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice));
+    }
+    else
+    {
+        Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
+        slice            = collapsed.first_slice_window_3D();
+        do
+        {
+            unsigned int idx = 0;
+            add_3D_tensor_argument(idx, src, slice);
+            add_3D_tensor_argument(idx, dst, slice);
+            enqueue(queue, *this, slice, lws_hint());
+        }
+        while(collapsed.slide_window_slice_3D(slice));
+    }
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h
new file mode 100644
index 0000000000..63fd806586
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCopyKernel.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_COPY_KERNEL_H
+#define ARM_COMPUTE_CL_COPY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform a copy between two tensors */
+class ClCopyKernel : public IClKernel
+{
+public:
+    ClCopyKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCopyKernel);
+    /** Initialize the kernel's src, dst.
+ * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: All. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCopyKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + Window _dst_window{}; + bool _has_dst_window{}; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_COPY_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp new file mode 100644 index 0000000000..c7e5537977 --- /dev/null +++ b/src/gpu/cl/kernels/ClCropKernel.cpp @@ -0,0 +1,136 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClCropKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CPP/Validate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include <map> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +ClCropKernel::ClCropKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, + float extrapolation_value, Window *dst_window) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window)); + + _start = start; + _batch_index = batch_index; + _extrapolation_value = extrapolation_value; + + const int vec_size_x = 4; + // Create and update the window (if needed) + Window win = calculate_max_window(*dst); + + if(dst_window != nullptr) + { + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window); + win = *dst_window; + } + + const int dst_width_x = win.num_iterations(0); + const bool multi_access_x = dst_width_x >= vec_size_x; + const bool remainder_x = dst_width_x % vec_size_x > 0; + + if(multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0))); + build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED="); + build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED="); + _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options()); +} + +Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +{ + ARM_COMPUTE_UNUSED(extrapolation_value, dst_window); + ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); + ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); + ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) + || end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); + if(dst_window != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); + } + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(dst->num_dimensions() > 3); + } + return Status{}; +} + +void ClCropKernel::run_op(ITensorPack 
&tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window in_slice = Window();
+    in_slice.use_tensor_dimensions(src->info()->tensor_shape());
+    in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step()));
+    in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1));
+
+    unsigned int idx = 0;
+    add_3D_tensor_argument(idx, src, in_slice);
+    add_3D_tensor_argument(idx, dst, window);
+    add_argument(idx, _start.x);
+    add_argument(idx, _start.y);
+    enqueue(queue, *this, window, lws_hint());
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h
new file mode 100644
index 0000000000..2f166e184c
--- /dev/null
+++ b/src/gpu/cl/kernels/ClCropKernel.h
@@ -0,0 +1,78 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_CROP_KERNEL_H
+#define ARM_COMPUTE_CL_CROP_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform tensor cropping */
+class ClCropKernel : public IClKernel
+{
+public:
+    ClCropKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClCropKernel);
+    /** Configure kernel
+     *
+     * @note Supported tensor rank: up to 4
+     *
+     * @param[in]  compile_context     The compile context to be used.
+     * @param[in]  src                 Source tensor info. Data type supported: All. Data layouts supported: NHWC.
+     * @param[out] dst                 Destination tensor info. Data type supported: F32.
+     * @param[in]  start               Coordinates of where to start cropping the image.
+     * @param[in]  end                 Coordinates of where to end cropping the image.
+     * @param[in]  batch_index         Fourth dimension index of the 3D image to crop in @p src.
+     * @param[in]  extrapolation_value Value to be used for values outside of the image. Default is 0.
+     * @param[in]  dst_window          Output window to be used in case the cropped image is being copied into a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+                   Window *dst_window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClCropKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0,
+                           Window *dst_window = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    Coordinates2D _start{};
+    uint32_t      _batch_index{};
+    float         _extrapolation_value{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_CROP_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
new file mode 100644
index 0000000000..d716f1e430
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp
@@ -0,0 +1,139 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) != dst->dimension(Window::DimY)); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(2) + depth_offset > dst->dimension(2)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(3, src, dst); + + return Status{}; +} +} // namespace + +ClDepthConcatenateKernel::ClDepthConcatenateKernel() + : _depth_offset(0) +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + _depth_offset = depth_offset; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16 / src->element_size(), src->dimension(0)); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate", build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + win.set(Window::DimZ, Window::Dimension(0, src->tensor_shape().z(), 1)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClDepthConcatenateKernel::validate(const arm_compute::ITensorInfo *src, + unsigned int depth_offset, + const arm_compute::ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, depth_offset, dst)); + 
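+    // All tensor checks are delegated to validate_arguments(), which is shared with configure();
+    // the execution window itself is only created at configure() time.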
return Status{}; +} + +void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window slice = window.first_slice_window_3D(); + + const int offset_to_first_elements_in_bytes = _depth_offset * dst->info()->strides_in_bytes()[2]; + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + _kernel.setArg<cl_int>(idx, offset_to_first_elements_in_bytes); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h new file mode 100644 index 0000000000..4739677f3b --- /dev/null +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h @@ -0,0 +1,74 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H +#define ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the depth concatenate kernel. + * The src tensor will be concatenated into the dst tensor. + */ +class ClDepthConcatenateKernel : public IClKernel +{ +public: + ClDepthConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDepthConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] depth_offset The offset on the Z axis. + * @param[in,out] dst Destination tensor info. Data types supported: Same as @p src. + * + * @note: The dst tensor's low two dimensions can't be smaller than the src one's. 
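+     * @note: A minimal configure/validate sketch (all names are illustrative; an initialized compile context and tensor infos are assumed):
+     * @code
+     * ClDepthConcatenateKernel concat;
+     * if(bool(ClDepthConcatenateKernel::validate(&src_info, depth_offset, &dst_info)))
+     * {
+     *     concat.configure(compile_context, &src_info, depth_offset, &dst_info);
+     * }
+     * @endcode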
+ * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. + * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClDepthConcatenateKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int depth_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + +private: + unsigned int _depth_offset; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DEPTH_CONCATENATE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp new file mode 100644 index 0000000000..c2e8f2f91f --- /dev/null +++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp @@ -0,0 +1,158 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/gpu/cl/kernels/ClDequantizeKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16);
+
+    if(dst->tensor_shape().total_size() > 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClDequantizeKernel::ClDequantizeKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32);
+
+    auto padding_info = get_padding_info({ src, dst });
+
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst));
+
+    const int  vec_size_x     = 16 / dst->element_size();
+    const int  output_width_x = dst->tensor_shape().x();
+    const bool multi_access_x = (output_width_x / vec_size_x > 0);
+
+    const bool  is_quantized_per_channel = is_data_type_quantized_per_channel(src->data_type());
+    std::string kernel_name              = "dequantization_layer";
+
+    // Set build options
+    CLBuildOptions build_opts;
+    if(!is_quantized_per_channel)
+    {
+        const UniformQuantizationInfo qinfo   = src->quantization_info().uniform();
+        const int                     qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? qinfo.offset : 0;
+        build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale));
+        build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset));
+    }
+    else
+    {
+        kernel_name += "_per_channel";
+        kernel_name += src->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc";
+    }
+
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type()));
+    build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type()));
+    build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst);
+    if(multi_access_x)
+    {
+        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClDequantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst));
+    return Status{};
+}
+
+void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type());
+
+    // Collapse window
+    Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3);
+    Window slice      = new_window.first_slice_window_3D();
+
+    if(is_quantized_per_channel)
+    {
+        unsigned int idx = num_arguments_per_3D_tensor() * 2; // Skip the input and output parameters
+        _kernel.setArg(idx++, src->quantization().scale->cl_buffer());
+    }
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src, slice);
+        add_3D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(new_window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.h b/src/gpu/cl/kernels/ClDequantizeKernel.h
new file mode 100644
index 0000000000..a32f506c9a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClDequantizeKernel.h
@@ -0,0 +1,64 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H +#define ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the dequantization layer kernel. */ +class ClDequantizeKernel : public IClKernel +{ +public: + ClDequantizeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDequantizeKernel); + /** Initialise the kernel's input and output + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info. Data types supported: F16/F32. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClDequantizeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DEQUANTIZE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp new file mode 100644 index 0000000000..cbeb9c43e9 --- /dev/null +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -0,0 +1,672 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/ITensor.h" +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + + const DataLayout data_layout = src->data_layout(); + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) + && std::get<0>(conv_info.stride()) > 2, + "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(data_layout != DataLayout::NHWC && !is_data_type_float(src->data_type()) && act_info.enabled(), + "Activation supported only for floating point and NHWC."); + + if(data_layout == DataLayout::NCHW) + { + if(is_data_type_quantized(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + } + } + + if(biases != nullptr) + { + if(is_data_type_quantized_asymmetric(src->data_type())) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); + 
} + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(weights, biases); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), + "Biases size and number of src feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, + "Biases should be one dimensional"); + } + + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + } + + const auto data_type = src->data_type(); + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + } + return Status{}; +} + +inline bool can_run_optimized_kernel_for_bifrost_nchw(GPUTarget gpu_target, unsigned int conv_stride_x, unsigned int conv_stride_y, unsigned int kernel_size, + DataType data_type, DataLayout data_layout) +{ + return gpu_target_is_in(gpu_target, + GPUTarget::G71, GPUTarget::G72, GPUTarget::G76, + GPUTarget::G51, GPUTarget::G51BIG, GPUTarget::G51LIT, + GPUTarget::G52, GPUTarget::G52LIT) + && (kernel_size <= 5) + && (conv_stride_x == 1) && (conv_stride_y == 1) + && (data_type == DataType::F32) + && (data_layout == DataLayout::NCHW); +} + +inline void setup_num_elems_nchw(unsigned int &num_elems_read_per_iteration_x, unsigned int &num_elems_read_per_iteration_y, + unsigned int &num_elems_written_per_iteration_x, unsigned int &num_elems_written_per_iteration_y, + unsigned int kernel_size, const PadStrideInfo &conv_info, const GPUTarget target, ITensorInfo *src) +{ + const DataType data_type = src->data_type(); + const DataLayout data_layout = src->data_layout(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + const bool run_optimized_bifrost = can_run_optimized_kernel_for_bifrost_nchw(target, conv_stride_x, conv_stride_y, kernel_size, data_type, data_layout); + + if(run_optimized_bifrost) + { + // Configure kernel window + switch(kernel_size) + { + case 1: + { + num_elems_read_per_iteration_x = 4; + num_elems_read_per_iteration_y = 4; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 4; + break; + } + case 3: + { + num_elems_read_per_iteration_x = 6; + num_elems_read_per_iteration_y = 5; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 3; + break; + } + case 5: + { + num_elems_read_per_iteration_x = 8; + num_elems_read_per_iteration_y = 6; + num_elems_written_per_iteration_x = 4; + num_elems_written_per_iteration_y = 2; + break; + } + default: + { + ARM_COMPUTE_ERROR("Kernel size not optimized for Bifrost"); + } + } + } + else + { + num_elems_read_per_iteration_y = kernel_size; + num_elems_written_per_iteration_x = 8; + num_elems_written_per_iteration_y = 1; + switch(kernel_size) + { + case 1: + switch(conv_stride_x) + { + case 1: + num_elems_read_per_iteration_x = 8; + break; + case 2: + num_elems_read_per_iteration_x = 16; + break; + 
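+                    // With a stride of 3, the number of elements read per iteration depends on the element size handled below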
                    case 3:
+                        switch(src->element_size())
+                        {
+                            case 1:
+                                num_elems_read_per_iteration_x = 28;
+                                break;
+                            case 2:
+                                num_elems_read_per_iteration_x = 24;
+                                break;
+                            case 4:
+                                num_elems_read_per_iteration_x = 22;
+                                break;
+                            default:
+                                ARM_COMPUTE_ERROR("Invalid data size");
+                        }
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 3:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 10;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 17;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 5:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 12;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 20;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            case 9:
+                switch(conv_stride_x)
+                {
+                    case 1:
+                        num_elems_read_per_iteration_x = 16;
+                        break;
+                    case 2:
+                        num_elems_read_per_iteration_x = 24;
+                        break;
+                    default:
+                        ARM_COMPUTE_ERROR("Invalid convolution stride X");
+                }
+                break;
+            default:
+                ARM_COMPUTE_ERROR("Invalid direct convolution size");
+        }
+    }
+}
+
+std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, const PadStrideInfo &conv_info, const GPUTarget target)
+{
+    const DataLayout data_layout = src->data_layout();
+
+    // Get dst shape
+    TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info);
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, output_shape,
+                       1,
+                       src->data_type(),
+                       src->quantization_info());
+
+    if(data_layout == DataLayout::NHWC)
+    {
+        const unsigned int vec_size = std::min(static_cast<unsigned int>(dst->tensor_shape()[0]), 4u);
+        unsigned int       num_rows = 1U;
+        if(dst->tensor_shape()[0] > 16)
+        {
+            num_rows = src->data_type() == DataType::F32 ?
2U : 4U; + } + + // Create window and update padding + Window win = calculate_max_window(output_shape, Steps(vec_size, num_rows)); + return std::make_pair(Status{}, win); + } + else if(data_layout == DataLayout::NCHW) + { + const int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int kernel_size = weights->dimension(width_idx); + + unsigned int num_elems_read_per_iteration_x = 0; + unsigned int num_elems_read_per_iteration_y = 0; + unsigned int num_elems_written_per_iteration_x = 0; + unsigned int num_elems_written_per_iteration_y = 0; + + unsigned int conv_pad_left = conv_info.pad_left(); + unsigned int conv_pad_top = conv_info.pad_top(); + unsigned int conv_stride_x = std::get<0>(conv_info.stride()); + unsigned int conv_stride_y = std::get<1>(conv_info.stride()); + + setup_num_elems_nchw(num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, + num_elems_written_per_iteration_x, num_elems_written_per_iteration_y, + kernel_size, conv_info, target, src); + + // Create window and update padding + bool window_changed = false; + Window win = calculate_max_window(*dst, Steps(num_elems_written_per_iteration_x, num_elems_written_per_iteration_y)); + + AccessWindowRectangle input_access(src, -conv_pad_left, -conv_pad_top, num_elems_read_per_iteration_x, num_elems_read_per_iteration_y, conv_stride_x, conv_stride_y); + AccessWindowStatic weights_access(weights, 0, 0, kernel_size, kernel_size); + AccessWindowRectangle output_access(dst, 0, 0, num_elems_written_per_iteration_x, num_elems_written_per_iteration_y); + window_changed = update_window_and_padding(win, input_access, weights_access, output_access); + output_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); + } + else + { + ARM_COMPUTE_ERROR("Not supported"); + } +} + +bool export_to_cl_image_support(ITensorInfo *tensor, GPUTarget gpu_target, DataLayout data_layout) +{ + if(tensor->tensor_shape()[0] % 4 || (data_layout != DataLayout::NHWC)) + { + return false; + } + + // If not floating point + if(!is_data_type_float(tensor->data_type())) + { + return false; + } + + if(gpu_target == GPUTarget::G71 || get_arch_from_target(gpu_target) == GPUTarget::MIDGARD) + { + return false; + } + + // Check if the cl_khr_image2d_from_buffer extension is supported on the target platform + if(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device())) + { + return false; + } + + // Check cl image pitch alignment + if(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0) + { + return false; + } + + const size_t image_w = tensor->tensor_shape()[0] / 4; + const size_t image_h = tensor->tensor_shape()[1] * tensor->tensor_shape()[2] * tensor->tensor_shape()[3]; + const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); + const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); + + if(image_w > max_image_w || image_h > max_image_h) + { + return false; + } + + return true; +} + +} // namespace + +BorderSize ClDirectConv2dKernel::border_size() const +{ + return _border_size; +} + +ClDirectConv2dKernel::ClDirectConv2dKernel() +{ + _type = CLKernelType::DIRECT; +} + +void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Perform validation + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info)); + + const int conv_stride_x = std::get<0>(conv_info.stride()); + const int conv_stride_y = std::get<1>(conv_info.stride()); + + _data_layout = src->data_layout(); + _conv_info = conv_info; + + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const unsigned int kernel_size = weights->dimension(width_idx); + const DataType data_type = src->data_type(); + + const GPUTarget gpu_target = get_target(); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, weights, dst, conv_info, gpu_target); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + std::stringstream kernel_name; + CLBuildOptions build_options; + + if(_data_layout == DataLayout::NHWC) + { + _border_size = BorderSize(); + + kernel_name << "direct_convolution_nhwc"; + + const unsigned int n0 = win_config.second.x().step(); + const unsigned int m0 = win_config.second.y().step(); + const unsigned int k0 = adjust_vec_size(is_data_type_quantized(data_type) ? 
16u : 8u, src->dimension(channel_idx)); + const unsigned int partial_store_n0 = dst->dimension(channel_idx) % n0; + const unsigned int pad_left = conv_info.pad_left(); + const unsigned int pad_top = conv_info.pad_top(); + const bool export_to_cl_image = export_to_cl_image_support(weights, gpu_target, _data_layout); + + // Update the padding for the weights tensor if we can export to cl_image + if(export_to_cl_image) + { + gemm::update_padding_for_cl_image(weights); + } + + if(biases != nullptr) + { + build_options.add_option(std::string("-DHAS_BIAS")); + build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); + } + + build_options.add_option("-cl-fast-relaxed-math"); + build_options.add_option("-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(width_idx))); + build_options.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(height_idx))); + build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(channel_idx))); + build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_options.add_option("-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(width_idx))); + build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(height_idx))); + build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(channel_idx))); + build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_options.add_option_if_else(export_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); + build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); + build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); + build_options.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x)); + build_options.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_stride_y)); + build_options.add_option("-DPAD_LEFT=" + support::cpp11::to_string(pad_left)); + build_options.add_option("-DPAD_TOP=" + support::cpp11::to_string(pad_top)); + build_options.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_options.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); + build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); + build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + PixelValue zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + int zero_value_s32; + zero_value.get(zero_value_s32); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DIS_QUANTIZED"); + build_options.add_option("-DDST_MULTIPLIER=" + 
support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DDST_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(zero_value_s32)); + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(DataType::S32)); + } + else + { + build_options.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); + build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); + build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + } + } + else + { + _border_size = BorderSize(src->padding()); + + kernel_name << "direct_convolution" << kernel_size << "x" << kernel_size; + + build_options.add_option_if(biases != nullptr, std::string("-DHAS_BIAS")); + + const bool run_optimized_for_bifrost = can_run_optimized_kernel_for_bifrost_nchw(gpu_target, conv_stride_x, conv_stride_y, kernel_size, data_type, _data_layout); + + if(run_optimized_for_bifrost) + { + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + + kernel_name << "_f32_bifrost"; + } + else + { + build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); + build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); + build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); + build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); + + if(is_data_type_quantized(data_type)) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; + int output_multiplier = 0; + int output_shift = 0; + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift); + build_options.add_option("-DOUTPUT_MULTIPLIER=" + support::cpp11::to_string(output_multiplier)); + build_options.add_option("-DOUTPUT_SHIFT=" + support::cpp11::to_string(output_shift)); + build_options.add_option("-DKERNEL_SIZE=" + support::cpp11::to_string(kernel_size)); + build_options.add_option("-DINPUT_OFFSET=" + support::cpp11::to_string(-iqinfo.offset)); + build_options.add_option("-DWEIGHTS_OFFSET=" + support::cpp11::to_string(-wqinfo.offset)); + build_options.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(oqinfo.offset)); + + kernel_name.str("direct_convolution_quantized"); + } + } + } + + _kernel = create_kernel(compile_context, kernel_name.str(), build_options.options()); + + // Set 
config_id for enabling LWS tuning + _config_id = kernel_name.str(); + _config_id += "_"; + _config_id += lower_string(string_from_data_type(data_type)); + _config_id += "_"; + _config_id += support::cpp11::to_string(kernel_size); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().left); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().top); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().right); + _config_id += "_"; + _config_id += support::cpp11::to_string(border_size().bottom); + _config_id += "_"; + _config_id += support::cpp11::to_string(conv_stride_x); + _config_id += "_"; + _config_id += support::cpp11::to_string(conv_stride_y); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(width_idx)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(height_idx)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); +} + +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), weights->clone().get(), dst->clone().get(), conv_info, target).first); + + return Status{}; +} + +void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + // Get initial windows + Window slice = window.first_slice_window_3D(); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(_data_layout == DataLayout::NHWC) + { + cl::Image2D weights_cl_image; + + const size_t dim_y_collapsed = ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2), slice.y().step()); + const bool export_to_cl_image = export_to_cl_image_support(weights->info(), get_target(), _data_layout); + + slice.set(Window::DimY, Window::Dimension(0, dim_y_collapsed, slice.y().step())); + slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(3), 1)); + + if(export_to_cl_image) + { + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const TensorShape shape2d(image_w, image_h); + const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; + + // Export cl_buffer to cl_image + weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch); + } + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, slice); + add_4D_tensor_argument(idx, dst, slice); + if(export_to_cl_image) + { + _kernel.setArg(idx++, weights_cl_image); + } + add_4D_tensor_argument(idx, weights, slice); + if(biases 
!= nullptr) + { + add_1D_tensor_argument(idx, biases, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + else + { + Window win_in = window; + + win_in.adjust(Window::DimX, -_conv_info.pad_left(), true); + win_in.adjust(Window::DimY, -_conv_info.pad_top(), true); + + const int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + const int conv_stride_x = std::get<0>(_conv_info.stride()); + const int conv_stride_y = std::get<1>(_conv_info.stride()); + + win_in.set_dimension_step(width_idx, window[width_idx].step() * conv_stride_x); + win_in.set_dimension_step(height_idx, window[height_idx].step() * conv_stride_y); + + Window slice_in = win_in.first_slice_window_3D(); + unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); + add_3D_tensor_argument(idx1, weights, slice); + + if(biases != nullptr) + { + Window slice_biases; + slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); + add_1D_tensor_argument(idx1, biases, slice_biases); + } + + _kernel.setArg(idx1++, static_cast<unsigned int>(weights->info()->strides_in_bytes()[3])); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice) && win_in.slide_window_slice_3D(slice_in)); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h new file mode 100644 index 0000000000..4041c7bf27 --- /dev/null +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -0,0 +1,89 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H +#define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the direct convolution kernel. */ +class ClDirectConv2dKernel : public IClKernel +{ +public: + ClDirectConv2dKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClDirectConv2dKernel); + /** Set the src, weights, biases and dst tensors info. 
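+     *
+     * A minimal usage sketch (all names are illustrative; initialized tensor infos, a compile context and the
+     * convolution/activation descriptors are assumed):
+     * @code
+     * ClDirectConv2dKernel conv;
+     * conv.set_target(GPUTarget::G76); // optional: lets configure() pick a target-tuned code path
+     * conv.configure(compile_context, &src_info, &weights_info, &biases_info, &dst_info, conv_info, act_info);
+     * @endcode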
+     *
+     * @note: Due to set_valid_region(), src/weights/biases cannot be const. This needs to change once set_valid_region() is removed.
+     *
+     * @note: DirectConvolution only works in the following configurations:
+     *        1x1 convolution with stride_x = 1/2/3, stride_y = 1/2/3
+     *        3x3 convolution with stride_x = 1/2, stride_y = 1/2
+     *        5x5 convolution with stride_x = 1/2, stride_y = 1/2
+     *        9x9 convolution with stride_x = 1/2, stride_y = 1/2
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             The src tensor info to convolve. 3 lower dimensions represent a single src [width, height, IFM],
+     *                            while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+     * @param[in] weights         Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                            The 3rd dimension must be the same as the src's volume 3rd dimension.
+     *                            Data type supported: Same as @p src.
+     * @param[in] biases          Biases tensor info. Biases are 1D tensor with dimension [OFM].
+     *                            Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type, where biases should be of S32 type.
+     * @param[out] dst            Output tensor info.
+     *                            The 3rd dimension must be equal to the 4th dimension of the @p weights tensor. Data types supported: Same as @p src.
+     * @param[in] conv_info       Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in] act_info        Contains activation information described in @ref ActivationLayerInfo.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                   const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClDirectConv2dKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                           const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const GPUTarget target);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+    BorderSize border_size() const override;
+
+public:
+    DataLayout    _data_layout{};
+    BorderSize    _border_size{};
+    PadStrideInfo _conv_info{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
new file mode 100644
index 0000000000..65cd052995
--- /dev/null
+++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp
@@ -0,0 +1,525 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "src/common/utils/Validate.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" +#include <map> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +constexpr unsigned int vector_size_byte_opencl = 16; + +std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = +{ + { ArithmeticOperation::ADD, "ADD" }, + { ArithmeticOperation::SUB, "SUB" }, + { ArithmeticOperation::DIV, "DIV" }, + { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" }, + { ArithmeticOperation::MIN, "MIN" }, + { ArithmeticOperation::MAX, "MAX" }, + { ArithmeticOperation::POWER, "POWER" }, + { ArithmeticOperation::PRELU, "PRELU" }, +}; + +std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = +{ + { ArithmeticOperation::ADD, "ADD" }, + { ArithmeticOperation::SUB, "SUB" }, +}; + +std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +{ + std::string config_id; + // Set config_id for enabling LWS tuning + config_id = kernel_name; + config_id += "_"; + config_id += lower_string(string_from_data_type(src1.data_type())); + config_id += "_"; + config_id += support::cpp11::to_string(dst.dimension(0)); + config_id += "_"; + config_id += support::cpp11::to_string(dst.dimension(1)); + return config_id; +} + +Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape) +{ + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? 
src1.tensor_shape() : src2.tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + } + return Status{}; +} + +Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); + + // Check whether it is in_place calculation + const bool in_place = (&src1 == &dst) || (&src2 == &dst); + const bool src1_in_place = in_place && (&src1 == &dst); + + const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + } + + return Status{}; +} + +Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::F16, DataType::F32, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + + const TensorShape out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); + } + + return Status{}; +} + +Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); + + if(is_data_type_quantized_symmetric(src1.data_type())) + { + const int32_t in1_offset = src1.quantization_info().uniform().offset; + const int32_t in2_offset = src2.quantization_info().uniform().offset; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in1_offset != 0, "For quantized symmetric, offset must be zero"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(in2_offset != 0, "For quantized symmetric, offset must be zero"); + } + + // Check whether it is in_place calculation + const bool 
in_place = (&src1 == &dst) || (&src2 == &dst); + const bool src1_in_place = in_place && (&src1 == &dst); + + const TensorShape out_shape = TensorShape::broadcast_shape(src1.tensor_shape(), src2.tensor_shape()); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + + if(is_data_type_quantized_symmetric(dst.data_type())) + { + const int32_t offset = dst.quantization_info().uniform().offset; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); + } + } + return Status{}; +} + +CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) +{ + CLBuildOptions build_opts; + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); + build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DOP=" + operation_string); + if(is_data_type_quantized(src1.data_type())) + { + const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); + const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst.quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + support::cpp11::to_string(iq1info.offset)); + build_opts.add_option("-DOFFSET_IN2=" + support::cpp11::to_string(iq2info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1info.scale)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); + } + build_opts.add_option_if(src1.data_type() == DataType::S32, "-DS32"); + + // Check whether it is in_place calculation + const bool in_place = (&src1 == &dst) || (&src2 == &dst); + const bool src1_in_place = in_place && (&src1 == &dst); + build_opts.add_option_if(in_place, "-DIN_PLACE"); + build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); + + return build_opts; +} + +std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst) +{ + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); + return std::make_pair(Status{}, win); +} + +std::pair<Status, Window> 
validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +{ + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const TensorShape &out_shape = broadcast_pair.first; + + auto_init_if_empty(dst, out_shape, 1, src1.data_type()); + + return configure_window_arithmetic_common(dst); +} + +std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +{ + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const TensorShape &out_shape = broadcast_pair.first; + + set_shape_if_empty(dst, out_shape); + set_data_type_if_unknown(dst, DataType::U8); + + return configure_window_arithmetic_common(dst); +} + +std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +{ + const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const TensorShape &out_shape = broadcast_pair.first; + + auto_init_if_empty(dst, out_shape, 1, src1.data_type()); + + return configure_window_arithmetic_common(dst); +} +} // namespace + +ClElementwiseKernel::ClElementwiseKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +{ + // Configure kernel window + auto win_config = validate_and_configure_window(*src1, *src2, *dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + + std::string kernel_name = "elementwise_operation_" + name(); + if(is_data_type_quantized(src1->data_type())) + { + kernel_name += "_quantized"; + } + + // Set kernel build options + CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); + if(_act_info.enabled()) + { + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); + build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); + build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(_act_info.b())); + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + ICLKernel::configure_internal(win_config.second); + + _config_id = generate_id_for_tuning(kernel_name, *src1, *dst); +} + +void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); + + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); + const TensorShape &in_shape2 = src_1->info()->tensor_shape(); + const TensorShape &out_shape = dst->info()->tensor_shape(); + + bool can_collapse = true; + const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; + if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + { + can_collapse = 
(std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ);
+        for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++)
+        {
+            can_collapse = (in_shape1[d] == in_shape2[d]);
+        }
+    }
+
+    bool   has_collapsed = false;
+    Window collapsed     = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window;
+
+    const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1;
+    const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2;
+
+    Window slice      = collapsed.first_slice_window_3D();
+    Window slice_src1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed);
+    Window slice_src2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed);
+
+    // Check whether it is in_place calculation
+    const bool in_place = (src_0 == dst) || (src_1 == dst);
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, src_0, slice_src1);
+        add_3D_tensor_argument(idx, src_1, slice_src2);
+        if(!in_place)
+        {
+            add_3D_tensor_argument(idx, dst, slice);
+        }
+
+        enqueue(queue, *this, slice, lws_hint());
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1));
+        ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2));
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+
+/** Logical binary */
+
+void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst));
+    _op = op;
+    configure_common(compile_context, src1, src2, dst);
+}
+
+Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_UNUSED(op);
+    ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not);
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2);
+
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst));
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first);
+
+    return Status{};
+}
+
+std::string ClLogicalBinaryKernel::name()
+{
+    switch(_op)
+    {
+        case LogicalOperation::And:
+            return "AND";
+        case LogicalOperation::Or:
+            return "OR";
+        case LogicalOperation::Not:
+        /* fall through */
+        default:
+            ARM_COMPUTE_ASSERT(false); // unreachable: unsupported logical operation
+    }
+    return "";
+}
+
+std::pair<Status, Window> ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst)
+{
+    return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst);
+}
+
+CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst)
+{
+    // The arithmetic utility functions can be shared
+    return generate_build_options_with_arithmetic_rules(src1, src2, dst, name());
+}
+
+std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst)
+{
+    return generate_id_for_tuning_common(kernel_name, src1, dst);
+}
+
+/** Arithmetic operations with saturation */
+void ClSaturatedArithmeticKernel::configure(const
ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, + const ConvertPolicy &policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); + auto padding_info = get_padding_info({ input1, input2, output }); + + _policy = policy; + _op = op; + _act_info = act_info; + configure_common(compile_context, input1, input2, output); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(op, policy); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); + + return Status{}; +} + +std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +{ + return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); +} + +CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +{ + const bool has_float_out = is_data_type_float(output.data_type()); + auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); + build_options.add_option((_policy == ConvertPolicy::WRAP || has_float_out) ? "-DWRAP" : "-DSATURATE"); + return build_options; +} + +std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) +{ + auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); + config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; + config_id += lower_string(string_from_data_layout(input1.data_layout())); + return config_id; +} + +std::string ClSaturatedArithmeticKernel::name() +{ + return supported_sat_arithmetic_ops[_op]; +} + +/** Arithmetic operations*/ +void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); + auto padding_info = get_padding_info({ src1, src2, dst }); + + _op = op; + _act_info = act_info; + configure_common(compile_context, src1, src2, dst); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); + if(op == ArithmeticOperation::DIV) + { + // Partial integer support S32/F32/F16 + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + } + else if(op == ArithmeticOperation::POWER) + { + // Power operators doesn't support integer arithmetic + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + } + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + + return Status{}; +} +std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +{ + if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) + { + // Division and Power operators don't support integer arithmetic + return validate_and_configure_window_for_division(src1, src2, dst); + } + else + { + return validate_and_configure_window_for_arithmetic_operators(src1, src2, dst); + } +} + +CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +{ + return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); +} +std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +{ + return generate_id_for_tuning_common(kernel_name, src1, dst); +} + +std::string ClArithmeticKernel::name() +{ + return supported_arithmetic_ops[_op]; +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h new file mode 100644 index 0000000000..3783e1571c --- /dev/null +++ b/src/gpu/cl/kernels/ClElementwiseKernel.h @@ -0,0 +1,200 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h new file mode 100644 index 0000000000..3783e1571c --- /dev/null +++ b/src/gpu/cl/kernels/ClElementwiseKernel.h @@ -0,0 +1,200 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H
+
+#include "src/core/KernelTypes.h"
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for an element-wise operation kernel
+ *
+ * Element-wise operation is computed by:
+ * @f[ dst(x,y) = OP(src1(x,y), src2(x,y))@f]
+ *
+ * For binary elementwise ops, in-place computation cannot be enabled by passing nullptr to dst; it can only be enabled by passing either src1 or src2 to dst instead.
+ *
+ */
+class ClElementwiseKernel : public IClKernel
+{
+public:
+    ClElementwiseKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementwiseKernel);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+
+protected:
+    /** The name of the operation */
+    virtual std::string name() = 0;
+
+    /** Validate the arguments and configure the kernel window
+     *
+     * @param[in] src1 First source tensor info. Data types supported: U8/S8/QASYMM8/QASYMM8_SIGNED/U16/S16/F16/U32/S32/F32.
+     * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+     * @param[in] dst  Destination tensor info. Data types supported: same as @p src1.
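+     *
+     * The execution window is derived from the broadcast output shape; e.g. (shapes assumed purely for illustration),
+     * broadcasting src1 = [1, 2, 3] against src2 = [4, 2, 3] yields a window over the output shape [4, 2, 3].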
+     *
+     * @return a pair of Status and Window
+     */
+    virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0;
+
+    /** Generate the build options for the specific kernel
+     *
+     * @return a CLBuildOptions struct
+     */
+    virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0;
+
+    /** Generate the identifier for tuning
+     *
+     * @return a string
+     */
+    virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0;
+
+    /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff)
+     *
+     */
+    void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+
+    ActivationLayerInfo _act_info{};
+};
+
+class ClLogicalBinaryKernel : public ClElementwiseKernel
+{
+public:
+    ClLogicalBinaryKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogicalBinaryKernel);
+    /** Function to configure kernel
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Logical binary operation to be executed.
+     * @param[in] src1            First source tensor info. Data types supported: U8.
+     * @param[in] src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[in] dst             Destination tensor info. Data types supported: same as @p src1.
+     */
+    void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClLogicalBinaryKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+private:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+    CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+    LogicalOperation _op{ LogicalOperation::Unknown };
+};
+
+/** Saturated arithmetic operations (ADD, SUB) */
+class ClSaturatedArithmeticKernel : public ClElementwiseKernel
+{
+public:
+    ClSaturatedArithmeticKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClSaturatedArithmeticKernel);
+    /** Function to configure kernel
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Arithmetic operation to be executed.
+     * @param[in] input1          First tensor input info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] input2          Second tensor input info. Data types supported: Same as @p input1.
+     * @param[in] output          Output tensor info. Data types supported: Same as @p input1.
+     * @param[in] policy          Policy to use to handle overflow.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
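+     *
+     * A minimal usage sketch (illustrative only; the F32 shapes and the default compile context are assumed, not taken from this header):
+     * @code
+     * ClSaturatedArithmeticKernel k;
+     * TensorInfo in1(TensorShape(16U, 4U), 1, DataType::F32);
+     * TensorInfo in2(TensorShape(16U, 4U), 1, DataType::F32);
+     * TensorInfo out(TensorShape(16U, 4U), 1, DataType::F32);
+     * Status st = ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, &in1, &in2, &out, ConvertPolicy::SATURATE);
+     * k.configure(CLKernelLibrary::get().get_compile_context(), ArithmeticOperation::ADD, &in1, &in2, &out, ConvertPolicy::SATURATE);
+     * @endcode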
+     */
+    void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClSaturatedArithmeticKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override;
+    CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override;
+
+private:
+    ConvertPolicy       _policy{};
+    ArithmeticOperation _op{};
+};
+
+class ClArithmeticKernel : public ClElementwiseKernel
+{
+public:
+    ClArithmeticKernel() = default;
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClArithmeticKernel);
+
+    /** Function to configure kernel
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] op              Arithmetic operation to be executed.
+     * @param[in] src1            First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/S32/F32.
+     * @param[in] src2            Second source tensor info. Data types supported: same as @p src1.
+     * @param[in] dst             Destination tensor info. Data types supported: same as @p src1.
+     * @param[in] act_info        (Optional) Activation layer information in case of a fused activation.
+     */
+    void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClArithmeticKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+protected:
+    // Inherited methods overridden:
+    std::string name() override;
+    std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override;
+    CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override;
+    std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override;
+
+private:
+    ArithmeticOperation _op{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp new file mode 100644 index 0000000000..1f09515b86 --- /dev/null +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp @@ -0,0 +1,168 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); + if(op == ElementWiseUnary::LOGICAL_NOT) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); + } + else if(op == ElementWiseUnary::NEG) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32); + } + + // Validate in case of configured dst + if(dst.total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); + } + + return Status{}; +} +} // namespace + +ClElementWiseUnaryKernel::ClElementWiseUnaryKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + auto padding_info = get_padding_info({ src, dst }); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op)); + + const std::string kernel_name = "elementwise_unary"; + const int vec_size_x = 16 / dst->element_size(); + const int dst_width_x = dst->tensor_shape().x(); + const bool multi_access_x = (dst_width_x / vec_size_x > 0); + + // Set kernel build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0))); + switch(op) + { + case ElementWiseUnary::RSQRT: + build_opts.add_option("-DOPERATION=rsqrt_op"); + break; + case ElementWiseUnary::EXP: + 
build_opts.add_option("-DOPERATION=exp_op"); + break; + case ElementWiseUnary::NEG: + build_opts.add_option("-DOPERATION=neg_op"); + break; + case ElementWiseUnary::SIN: + build_opts.add_option("-DOPERATION=sin_op"); + break; + case ElementWiseUnary::ABS: + build_opts.add_option("-DOPERATION=fabs_op"); + break; + case ElementWiseUnary::LOG: + build_opts.add_option("-DOPERATION=natural_log_op"); + break; + case ElementWiseUnary::ROUND: + build_opts.add_option("-DOPERATION=round_op"); + break; + case ElementWiseUnary::LOGICAL_NOT: + build_opts.add_option("-DOPERATION=logical_not_op"); + break; + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*dst); + if(multi_access_x) + { + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClElementWiseUnaryKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op) +{ + ARM_COMPUTE_UNUSED(op); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(*src, *dst, op)); + + return Status{}; +} + +void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h new file mode 100644 index 0000000000..0f270f25e8 --- /dev/null +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the elementwise unary operator */
+class ClElementWiseUnaryKernel : public IClKernel
+{
+public:
+    ClElementWiseUnaryKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClElementWiseUnaryKernel);
+    /** Initialise the kernel's src and dst.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: F16/F32 (F16/F32/S32 for NEG, U8 for LOGICAL_NOT).
+     * @param[out] dst             Destination tensor info. Data types supported: same as @p src.
+     * @param[in]  op              Element wise unary operation to perform.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClElementWiseUnaryKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ElementWiseUnary &op);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClFillKernel.cpp b/src/gpu/cl/kernels/ClFillKernel.cpp new file mode 100644 index 0000000000..a9345ee334 --- /dev/null +++ b/src/gpu/cl/kernels/ClFillKernel.cpp @@ -0,0 +1,120 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClFillKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+ClFillKernel::ClFillKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClFillKernel::configure(const CLCompileContext &compile_context, ITensorInfo *tensor,
+                             const PixelValue &constant_value,
+                             Window *window)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(tensor);
+    ARM_COMPUTE_ERROR_THROW_ON(validate(tensor, constant_value, window));
+
+    const DataType data_type  = tensor->data_type();
+    const int      vec_size_x = 16 / tensor->element_size();
+
+    // Create and update the window (if needed)
+    _full_window = calculate_max_window(*tensor);
+    Window win   = _full_window;
+    if(window != nullptr)
+    {
+        ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *window);
+        win = *window;
+    }
+
+    const int  output_width_x = win.num_iterations(0);
+    const bool multi_access_x = output_width_x >= vec_size_x;
+    const bool remainder_x    = output_width_x % vec_size_x > 0;
+
+    if(multi_access_x)
+    {
+        win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+    }
+    ICLKernel::configure_internal(win);
+
+    // Create kernel
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(constant_value, data_type));
+    build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+    build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0)));
+    _kernel = create_kernel(compile_context, "memset", build_opts.options());
+}
+
+Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window)
+{
+    ARM_COMPUTE_UNUSED(tensor);
+    ARM_COMPUTE_UNUSED(constant_value);
+    if(window != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1);
+    }
+    return Status{};
+}
+
+void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+
+    // Collapse all the batches on the third dimension
+    Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ);
+    Window slice     = collapsed.first_slice_window_3D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_3D_tensor_argument(idx, tensor, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(collapsed.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
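// Worked example for the vectorized fill above (numbers assumed for illustration): for an F32 tensor,
// vec_size_x = 16 / 4 = 4 elements per access; with output_width_x = 10, multi_access_x and remainder_x are
// both true, so -DLAST_ACCESSED_X=max(10 - 4, 0)=6 makes the final vector access start at x = 6 and cover
// elements 6..9 without writing out of bounds.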
diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h new file mode 100644 index 0000000000..f25cf928ad --- /dev/null +++ b/src/gpu/cl/kernels/ClFillKernel.h @@ -0,0 +1,68 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FILL_KERNEL_H
+#define ARM_COMPUTE_CL_FILL_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for filling the planes of a tensor */
+class ClFillKernel : public IClKernel
+{
+public:
+    ClFillKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFillKernel);
+    /** Initialise the kernel's tensor and filling value
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] tensor          Input tensor info. Supported data types: All.
+     * @param[in]     constant_value  The value used to fill the planes of the tensor
+     * @param[in]     window          Window to be used when setting only part of a tensor. Default is nullptr.
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClFillKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+    Window _full_window{};
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FILL_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp new file mode 100644 index 0000000000..1cb78242cb --- /dev/null +++ b/src/gpu/cl/kernels/ClFloorKernel.cpp @@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClFloorKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + + // Validate in case of configured output + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +ClFloorKernel::ClFloorKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Auto initialize output + auto_init_if_empty(*dst, src->tensor_shape(), 1, src->data_type()); + + // Validate + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + auto padding_info = get_padding_info({ src, dst }); + + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftovers)); + + // Create kernel + _kernel = create_kernel(compile_context, "floor_layer", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(vec_size_x)); + IClKernel::configure_internal(win); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClFloorKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = 
window.collapse_if_possible(IClKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClFloorKernel.h b/src/gpu/cl/kernels/ClFloorKernel.h new file mode 100644 index 0000000000..6e413340ba --- /dev/null +++ b/src/gpu/cl/kernels/ClFloorKernel.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_FLOOR_KERNEL_H +#define ARM_COMPUTE_CL_FLOOR_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to perform a floor operation */ +class ClFloorKernel : public IClKernel +{ +public: + ClFloorKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClFloorKernel); + /** Configure kernel for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data type supported: F16/F32. + * @param[out] dst Destination tensor info. Same as @p src + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClFloorKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FLOOR_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp new file mode 100644 index 0000000000..cb03c6255f --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -0,0 +1,335 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + if(src0->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); + + const int m = gemm_info.m(); + const int n = gemm_info.n(); + const int k = gemm_info.k(); + + ARM_COMPUTE_UNUSED(m); + ARM_COMPUTE_UNUSED(n); + 
ARM_COMPUTE_UNUSED(k); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k)); + if(gemm_info.reinterpret_input_as_3d()) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m)); + } + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + bool reinterpret_dst_as_3d = (gemm_info.depth_output_gemm3d() != 0); + + Window win{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_dst_as_3d to be false. + if(reinterpret_input_as_3d == reinterpret_dst_as_3d) + { + reinterpret_dst_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + + TensorInfo tmp_info(*dst); + + if(reinterpret_dst_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + // RHS matrix still needs padding on the X + AccessWindowStatic src1_access(src1, 0, 0, + ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), + src1->dimension(1)); + + window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + + // We still need padding on the X dimension for the RHS matrix + auto padding_info = get_padding_info({ src0, dst }); + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->info()->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m() : dst->dimension(1); + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
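+ // (e.g. a GEMM with internal_m = 2 and lhs_info.m0 = 8 yields internal_m0 = min(2, 8) = 2, so a block never spans rows past the end of the LHS matrix)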
+ // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + std::string kernel_name("gemmlowp_mm_native"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; + _config_id += "_"; + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k()); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h new file mode 100644 index 0000000000..4b328e0ab8 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -0,0 +1,81 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices with QASYMM8/QASYMM8_SIGNED data type */ +class ClGemmLowpMatrixMultiplyNativeKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyNativeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyNativeKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] src1 Source tensor containing the RHS matrix. 
Data type supported: same as @p src0 + * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 + * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * @param[in] rhs_info RHS matrix information used to retrieve the number of columns to be processed by each thread + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp new file mode 100644 index 0000000000..6446b4ce38 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp @@ -0,0 +1,300 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace misc::shape_calculator; + +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose); + ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); + + const int m = gemm_info.m(); + const int n = gemm_info.n(); + const int k = gemm_info.k(); + + TensorShape tensor_shape0{ src0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const 
GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + + TensorInfo tmp_info(*dst); + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + return std::make_pair(Status{}, collapsed); +} +} // namespace + +ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + + _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d() != 0); + _k = gemm_info.k(); + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + auto padding_info = get_padding_info({ src0, src1, dst }); + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int internal_m = _reinterpret_output_as_3d ?
gemm_info.m() : dst->dimension(1); + + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n() % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m())); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n())); + build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + std::string kernel_name("gemmlowp_mm_reshaped_"); + kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; + kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; + _config_id += "_"; + _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k()); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.v0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.interleave); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k)); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned 
int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h new file mode 100644 index 0000000000..a16f500f11 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped + * + * @note The input matrices @p src0 and @p src1 must be reshaped through: + * - @ref opencl::kernels::ClGemmReshapeLhsMatrixKernel + * - @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel + */ +class ClGemmLowpMatrixMultiplyReshapedKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyReshapedKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Source tensor containing the LHS reshaped matrix. Data type supported: QASYMM8/QASYMM8_SIGNED. The number of dimensions for the LHS matrix must be less or equal than 4. + * @param[in] src1 Source tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less or equal than 3. + * @param[out] dst Destination tensor to store the result of matrix multiplication. Data type supported: S32 + * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. 
Only the following values are supported: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * lhs_info.transpose: false + * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * rhs_info.transpose: true + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + * + * @note lhs_info.k0 must be equal to rhs_info.k0 + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyReshapedKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_output_as_3d{ false }; + unsigned int _k{ 1 }; + bool _use_dummy_work_items{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp new file mode 100644 index 0000000000..bacf07fb4b --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -0,0 +1,544 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include <tuple> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace misc::shape_calculator; + +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + if(src0->data_type() == DataType::QASYMM8) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + } + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + + const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; + const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; + const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); + + const int m = gemm_info.m; + const int n = gemm_info.n; + const int k = gemm_info.k; + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); + if(gemm_info.reinterpret_input_as_3d) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m)); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = 
dst->clone()->set_tensor_shape(expected_dst_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + if(output_stage.type == GEMMLowpOutputStageType::NONE) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + } + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); + + // Checks performed if the dst stage needs to be fused + if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + // If a_offset == 0, vector_sum_col can be a nullptr + if(gemm_info.a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(gemm_info.b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if mm result is a 3D reinterpretation + const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); + + if(expected_dst_shape.num_dimensions() > 1) + { + const unsigned int dst_batch_idx = reinterpret_as_3d ? 
3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + TensorShape collapsed_dst_shape(expected_dst_shape); + collapsed_dst_shape.collapse_from(dst_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], + "vector_sum_row must have the same number of batches of dst tensor"); + + if(gemm_info.a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + } + } + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); + } + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + + if(output_multipliers != nullptr && output_shifts != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + if(output_stage.is_quantized_per_channel) + { + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); + } + } + } + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +{ + const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; + + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
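+ // Only the local dst flag is cleared here, so the window below is built on the full (non-collapsed) dst shape;
+ // configure() applies the matching reset to the member flags and dispatches a batched GEMM instead.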
+ if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); + if(output_stage.type != GEMMLowpOutputStageType::NONE) + { + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + } + else + { + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(DataType::S32)); + } + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; + num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + if(gemm_info.a_offset != 0) + { + AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); + window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); + } + // No access window needed for vector_sum_row + ARM_COMPUTE_UNUSED(vector_sum_row); + + if(bias != nullptr) + { + AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); + window_changed = window_changed || update_window_and_padding(win_out, bias_access); + } + + if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + { + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); + window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); + } + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, + ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + + auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; + const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; + const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; + const int32_t a_offset = gemm_info.a_offset; + const int32_t b_offset = gemm_info.b_offset; + + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + _reinterpret_output_as_3d = (gemm_info.depth_output_gemm3d != 0); + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _is_quantized_per_channel = output_stage.is_quantized_per_channel; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. + // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. 
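+ // (e.g. internal_m = 33 with internal_m0 = 8 leaves partial_store_m0 = 33 % 8 = 1 trailing row to store,
+ // while gemm_info.n = 24 with rhs_info.n0 = 4 gives partial_store_n0 = 0, i.e. no partial block along N)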
+ const unsigned int partial_store_m0 = internal_m % internal_m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(src0->data_type())); + + std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); + kernel_name += rhs_info.transpose ? 
"t" : "nt"; + + if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + kernel_name += "_fused_output_stage_fixedpoint"; + _fuse_output_stage = true; + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0 && vector_sum_col != nullptr) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * src0->dimension(0))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); + build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); + build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + + const int min = output_stage.gemmlowp_min_bound; + const int max = output_stage.gemmlowp_max_bound; + + PixelValue min_val{}; + PixelValue max_val{}; + std::tie(min_val, max_val) = get_min_max(dst->data_type()); + build_opts.add_option_if(min != min_val.get<int32_t>(), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if(max != max_val.get<int32_t>(), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + } + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += dot8_supported(CLKernelLibrary::get().get_device()) ? "_dot8" : ""; + _config_id += "_"; + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + dst->clone().get(), + gemm_info, + vector_sum_col != nullptr ? 
vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + idx++; + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + idx++; + } + + if(_fuse_output_stage) + { + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); + } + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h new file mode 100644 index 0000000000..a77604db7c --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -0,0 +1,100 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices with QASYMM8 data type when only the input matrix RHS (src1) has been reshaped + * + * @note The input matrix src1 must be reshaped through @ref opencl::kernels::ClGemmReshapeRhsMatrixKernel + * @note For fused output stage, only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT type is supported + */ +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel : public IClKernel +{ +public: + ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0 + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/S32. + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices, output stage information and RHS/LHS info. + * Only the following values are supported for LHS info: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * Only the following values are supported for RHS info: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * rhs_info.transpose: true + * @param[in] vector_sum_col (Optional) Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: S32 + * @param[in] vector_sum_row (Optional) Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: S32 + * @param[in] bias (Optional) Biases tensor. Only shared biases are supported and it can be a nullptr if the addition of biases is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: S32. + * @param[in] output_multipliers (Optional) Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32. + * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM). + * Supported data types: S32.
+ */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _is_quantized_per_channel{ false }; + bool _fuse_output_stage{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
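For readers unfamiliar with how run_op() above addresses the extra kernel arguments: the three 2D tensors and the three explicit z-strides occupy the leading argument slots, and the cross-plane padding values for 3D-reinterpreted tensors follow. The sketch below mirrors only that index arithmetic; args_per_2d_tensor is a hypothetical stand-in for num_arguments_per_2D_tensor(), whose actual value is assumed here, not taken from the library.

#include <cassert>

// Hypothetical helper mirroring the argument-index arithmetic of run_op().
constexpr unsigned int args_per_2d_tensor = 3; // assumption for the example only

unsigned int input_pad_arg_index()
{
    // 3 tensors (src0, src1, dst) plus 3 explicit z-strides come first
    return 3 * args_per_2d_tensor + 3;
}

unsigned int output_pad_arg_index(bool input_is_3d)
{
    // The dst padding slot follows the src one when the input is also reinterpreted as 3D
    return input_pad_arg_index() + (input_is_3d ? 1u : 0u);
}

int main()
{
    assert(input_pad_arg_index() == 12);
    assert(output_pad_arg_index(true) == 13);
    // The value passed at that index is padding().top + padding().bottom
    return 0;
}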
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp new file mode 100644 index 0000000000..5d2561d0dc --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -0,0 +1,212 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches as the output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches as vector_sum_row, or its number of batches must be set to 1"); + } + } + } + + return Status{}; +} +} // namespace + +ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t k, int32_t a_offset, int32_t b_offset) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->num_dimensions() > 1 + && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + std::string kernel_name("gemmlowp_offset_contribution"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); + IClKernel::configure_internal(win); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name + "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(2));
+ + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t a_offset, int32_t b_offset) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + return Status{}; +} + +void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); + + const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); + + Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, mm_result, slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h new file mode 100644 index 0000000000..48926e280b --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -0,0 +1,86 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to add the offset contribution after the matrix multiplication. The computation is performed in-place + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), + * and adds to it the offset contribution of matrix A and matrix B in-place. + * + * The final result is: + * + * mm_result[i][k] = mm_result[i][k] + + * (vector_sum_col[k] * a_offset) + + * (vector_sum_row[i] * b_offset) + + * (a_offset * b_offset * k) + * + */ +class ClGemmLowpOffsetContributionKernel : public IClKernel +{ +public: + ClGemmLowpOffsetContributionKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the addition of biases is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[in] k Number of matrix A columns or matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B.
+ */ + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + int32_t k, int32_t a_offset, int32_t b_offset); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp new file mode 100644 index 0000000000..a8a8207504 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -0,0 +1,263 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
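For reference, the offset-contribution formula documented in the header above amounts to the small host-side loop below. This is an illustrative sketch only, independent of the library API; the row-major layout and the names are simplified assumptions.

#include <cstdint>
#include <vector>

// Host-side equivalent of the offset contribution (sketch only):
// mm_result[i][j] += vector_sum_col[j] * a_offset
//                  + vector_sum_row[i] * b_offset
//                  + a_offset * b_offset * k
void offset_contribution_ref(std::vector<int32_t>       &mm_result,      // M x N, row-major
                             const std::vector<int32_t> &vector_sum_col, // N entries
                             const std::vector<int32_t> &vector_sum_row, // M entries
                             int m, int n, int k,
                             int32_t a_offset, int32_t b_offset)
{
    const int32_t k_offset = a_offset * b_offset * k; // matches the -DK_OFFSET build option
    for(int i = 0; i < m; ++i)
    {
        for(int j = 0; j < n; ++j)
        {
            mm_result[i * n + j] += vector_sum_col[j] * a_offset + vector_sum_row[i] * b_offset + k_offset;
        }
    }
}

When a_offset or b_offset is zero, the corresponding row/column-sum term vanishes, which is why the kernel accepts nullptr for vector_sum_col and vector_sum_row in those cases.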
+ */ +#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, + int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != bias->dimension(0)); + } + + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); + if(output_stage.is_quantized_per_channel) + { + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); + ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); + } + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); + } + + // If b_offset == 0, vector_sum_row can be a nullptr + if(b_offset != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Validate input + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); + + TensorShape output_shape = mm_result->tensor_shape(); + if(output_shape.num_dimensions() > 1) + { + const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; + + TensorShape vector_sum_row_shape = vector_sum_row->tensor_shape(); + vector_sum_row_shape.collapse_from(1); + output_shape.collapse_from(output_batch_idx); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], + "mm_result tensor must have the same number of batches as the output tensor"); + + if(a_offset != 0) + { + TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); + vector_sum_col_shape.collapse_from(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches as vector_sum_row, or its number of batches must be set to 1"); + } + } + } + + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); + // Checks performed when output is configured + if((dst != nullptr) && (dst->total_size() != 0)) + { + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(mm_result, dst); + } + + ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per-channel quantization info is incorrect"); + + return Status{}; +} +} // namespace + +ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutputStageKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + + auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); + + const int min = output_stage.gemmlowp_min_bound; + const int max = output_stage.gemmlowp_max_bound; + + _is_quantized_per_channel = output_stage.is_quantized_per_channel; + + // Check if input is a 3D reinterpretation + const bool reinterpret_as_3d = vector_sum_row != nullptr + && mm_result->num_dimensions() > 1 + && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + + // Auto initialize the output + auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + + // If a_offset == 0, vector_sum_col can be a nullptr + if(a_offset != 0) + { + build_opts.add_option("-DA_OFFSET=" +
support::cpp11::to_string(a_offset)); + build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); + } + // If b_offset == 0, vector_sum_row can be a nullptr + build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); + build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); + build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); + build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); + build_opts.add_option_if(_is_quantized_per_channel, "-DPER_CHANNEL_QUANTIZATION"); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + + PixelValue min_val{}; + PixelValue max_val{}; + std::tie(min_val, max_val) = get_min_max(dst->data_type()); + build_opts.add_option_if((min > min_val.get<int32_t>()), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < max_val.get<int32_t>()), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + + std::string kernel_name("gemmlowp_offset_contribution"); + kernel_name += "_" + string_from_gemmlowp_output_stage(output_stage.type); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*mm_result, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name + "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mm_result->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, + const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + return Status{}; +} + +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto 
vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Set window for vector_sum_col + Window win_vector_sum_col = slice; + win_vector_sum_col.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_col.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Set window for vector_sum_row + Window win_vector_sum_row = slice; + win_vector_sum_row.set(Window::DimX, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimY, Window::Dimension(0, 0, 0)); + win_vector_sum_row.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + Window biases_slice = slice; + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, mm_result, slice); + add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); + add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); + add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); + add_3D_tensor_argument(idx, dst, slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); + add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h new file mode 100644 index 0000000000..cee04473c4 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -0,0 +1,90 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
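Conceptually, the fused kernel whose run_op() appears above performs the offset contribution and then the quantize-down in one pass. The sketch below shows that order of operations for a single accumulator, per-tensor case, with bias omitted; the requantization arithmetic is a naive stand-in for the gemmlowp-style fixed-point routines, not the kernel's actual OpenCL code.

#include <algorithm>
#include <cstdint>

// Per-tensor sketch of the fused computation (bias and per-channel paths omitted).
uint8_t fused_contribution_then_requantize(int32_t acc,
                                           int32_t col_sum, int32_t row_sum,
                                           int32_t a_offset, int32_t b_offset, int32_t k,
                                           int32_t result_multiplier, int result_shift,
                                           int32_t result_offset,
                                           int32_t min_bound, int32_t max_bound)
{
    // Offset contribution (same formula as the stand-alone kernel)
    acc += col_sum * a_offset + row_sum * b_offset + a_offset * b_offset * k;
    // Naive stand-in for the fixed-point requantization: multiplier is Q0.31
    int64_t v = static_cast<int64_t>(acc) * result_multiplier;
    v >>= 31;
    if(result_shift > 0)
    {
        v = (v + (1ll << (result_shift - 1))) >> result_shift; // round to nearest
    }
    v += result_offset;
    v = std::min<int64_t>(std::max<int64_t>(v, min_bound), max_bound);
    return static_cast<uint8_t>(v);
}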
+ */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to add the offset contribution after the matrix multiplication and perform the output stage. + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), adds to it the offset contribution + * of matrix A and matrix B and performs the output stage defined by the output_stage argument + * + * @note For quantized computations the output data type for auto-initialization must be passed as part of the @ref GEMMLowpOutputStageInfo. + */ +class ClGemmLowpOffsetContributionOutputStageKernel : public IClKernel +{ +public: + ClGemmLowpOffsetContributionOutputStageKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpOffsetContributionOutputStageKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] mm_result Input tensor containing the result of the matrix multiplication. Data type supported: S32 + * @param[in] vector_sum_col Input row-vector of sums of all the entries in each column of matrix B. + * Note: vector_sum_col can be a nullptr in case a_offset = 0. Data type supported: same as @p mm_result + * @param[in] vector_sum_row Input row-vector of sums of all the entries in each row of matrix A. + * Note: vector_sum_row can be a nullptr in case b_offset = 0. Data type supported: same as @p mm_result + * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the addition of biases is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p mm_result. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] k Number of matrix A columns or matrix B rows + * @param[in] a_offset Offset to be added to each element of the matrix A. + * @param[in] b_offset Offset to be added to each element of the matrix B. + * @param[in] output_stage GEMMLowp output stage info + * @param[in] output_multipliers Output multipliers tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). + * Supported data types: S32 + * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of shifts must be equal to the number of filters (OFM).
+ * Supported data types: S32 + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, + int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, + int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _is_quantized_per_channel{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OFFSET_CONTRIBUTION_OUTPUT_STAGE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp new file mode 100644 index 0000000000..c50023c3dd --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
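Every configure() in these kernels derives the same two vectorization build options: VEC_SIZE via adjust_vec_size(4, dim0) and VEC_SIZE_LEFTOVER as the remainder of the row length. The snippet below illustrates that derivation; adjust_vec_size_assumed is a hypothetical stand-in whose behavior (capping the preferred width at the row length) is an assumption, as the real helper may round differently.

#include <cassert>
#include <string>

// Assumed behavior of the vector-size helper: never exceed the row length.
unsigned int adjust_vec_size_assumed(unsigned int preferred, unsigned int dim0)
{
    return preferred < dim0 ? preferred : dim0;
}

int main()
{
    const unsigned int dim0     = 10;
    const unsigned int vec_size = adjust_vec_size_assumed(4, dim0); // 4
    const unsigned int leftover = dim0 % vec_size;                  // 2 trailing elements
    const std::string  opts     = "-DVEC_SIZE=" + std::to_string(vec_size) +
                                  " -DVEC_SIZE_LEFTOVER=" + std::to_string(leftover);
    assert(opts == "-DVEC_SIZE=4 -DVEC_SIZE_LEFTOVER=2");
    return 0;
}

The leftover count lets the OpenCL kernel handle a partial trailing vector without requiring padded tensors, which is why configure() asserts that the padding requirements did not change.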
+ */ +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + + return Status{}; +} + +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + auto padding_info = get_padding_info({ src, bias, dst }); + + // dst auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); + + // Set the arguments to pass at compile time + auto min = info->gemmlowp_min_bound; + auto max = info->gemmlowp_max_bound; + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); + build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); +
build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + // Create src window + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Setup bias slice + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + // Use a copy of the dst argument index so it does not accumulate across slices + unsigned int idx_dst = idx1; + add_3D_tensor_argument(idx_dst, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h new file mode 100644 index 0000000000..69b5fc5018 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED/QSYMM16 + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final quantized value. + * The following computations will be performed by the kernel: + * + * -# Compute fixed point multiplication between each entry of input by gemmlowp_multiplier + * -# Add bias to final result if bias tensor is not a nullptr + * -# Round to nearest division by a power-of-two using result_shift + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values to the proper quantized range and cast to QASYMM8/QASYMM8_SIGNED/QSYMM16. + */ +class ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel : public IClKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel); + /** Initialise the kernel's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the bias addition is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. + * @param[in] info Output stage info. Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FIXEDPOINT_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp new file mode 100644 index 0000000000..c5cea3d17d --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -0,0 +1,160 @@ +/* + * Copyright (c) 2018-2021 Arm Limited.
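The fixed-point quantize-down steps listed in the header above follow the usual gemmlowp arithmetic. The reference below sketches that arithmetic on the host (per-tensor case, no bias); the helper implementations mirror the well-known gemmlowp reference routines, and the OpenCL kernel may differ in detail.

#include <algorithm>
#include <cassert>
#include <cstdint>
#include <limits>

// Fixed-point multiply by a Q0.31 multiplier with rounding and saturation.
int32_t saturating_rounding_doubling_high_mul(int32_t a, int32_t b)
{
    const bool    overflow = (a == b) && (a == std::numeric_limits<int32_t>::min());
    const int64_t ab       = static_cast<int64_t>(a) * static_cast<int64_t>(b);
    const int32_t nudge    = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
    const int32_t result   = static_cast<int32_t>((ab + nudge) / (1ll << 31));
    return overflow ? std::numeric_limits<int32_t>::max() : result;
}

// Round-to-nearest division by a power of two (exponent in [0, 31]).
int32_t rounding_divide_by_pow2(int32_t x, int exponent)
{
    const int32_t mask      = (1 << exponent) - 1;
    const int32_t remainder = x & mask;
    const int32_t threshold = (mask >> 1) + ((x < 0) ? 1 : 0);
    return (x >> exponent) + ((remainder > threshold) ? 1 : 0);
}

uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int shift,
                                 int32_t offset, int32_t min_bound, int32_t max_bound)
{
    int32_t v = saturating_rounding_doubling_high_mul(acc, multiplier);
    v = rounding_divide_by_pow2(v, shift); // RESULT_SHIFT
    v += offset;                           // RESULT_OFFSET_AFTER_SHIFT
    v = std::min(std::max(v, min_bound), max_bound);
    return static_cast<uint8_t>(v);
}

int main()
{
    // multiplier 1 << 30 represents 0.5 in Q0.31; 100 * 0.5 = 50, then >> 1 gives 25
    assert(quantize_down_fixedpoint(100, 1 << 30, 1, 0, 0, 255) == 25);
    return 0;
}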
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) + || info->gemmlowp_min_bound > info->gemmlowp_max_bound); + + // Check biases if exist + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); + ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + } + + return Status{}; +} +} // namespace + +ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, info)); + + return Status{}; +} + +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const 
CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); + + auto padding_info = get_padding_info({ src, bias, dst }); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0)); + + auto min = info->gemmlowp_min_bound; + auto max = info->gemmlowp_max_bound; + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); + build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option_if((min > 0), "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < 255), "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down_float", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + // Create input window + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + // Setup bias slice + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + // Use a copy of the dst argument index so it does not accumulate across slices + unsigned int idx_dst = idx1; + add_3D_tensor_argument(idx_dst, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h new file mode 100644 index 0000000000..8eda24d25f --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -0,0 +1,80 @@ +/*
* Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H +#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED + * + * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * The following computations will be performed by the kernel: + * + * -# Compute the multiplication of each entry of the input by the real multiplier (gemmlowp_real_multiplier) + * -# Add bias to the final result if the bias tensor is not a nullptr + * -# Round to the nearest integer + * -# Add offset to each result + * -# Clamp the value between the specified min and max bounds + * -# Clamp the resulting int32 values: + * - to the [0..255] range and cast to QASYMM8, + * - to the [-128..127] range and cast to QASYMM8_SIGNED. + */ +class ClGemmLowpQuantizeDownInt32ScaleByFloatKernel : public IClKernel +{ +public: + ClGemmLowpQuantizeDownInt32ScaleByFloatKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleByFloatKernel); + /** Initialise the kernel's input and output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases are supported and it can be a nullptr if the bias addition is not required. + * Biases are a 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED + * @param[in] info Output stage info.
Used to pass the quantized output data type + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_BY_FLOAT_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp new file mode 100644 index 0000000000..5469ea9602 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -0,0 +1,157 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32);
+    ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED));
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)));
+    ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))
+                                || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound);
+
+    // Check biases if exist
+    if(bias != nullptr)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias);
+        ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1);
+        ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0));
+    }
+
+    if(dst->total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type");
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst);
+    }
+
+    return Status{};
+}
+} // namespace
+
+ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage));
+
+    return Status{};
+}
+
+void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst,
+                                                       const GEMMLowpOutputStageInfo *output_stage)
+{
+    // Perform validate step
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage));
+
+    auto padding_info = get_padding_info({ src, bias, dst });
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type));
+
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, src->dimension(0));
+
+    // Set the arguments to pass at compile time
+    auto min = output_stage->gemmlowp_min_bound;
+    auto max = output_stage->gemmlowp_max_bound;
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration));
+    build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset));
+    build_opts.add_option("-DRESULT_MULT_INT=" +
support::cpp11::to_string(output_stage->gemmlowp_multiplier)); + build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); + + // Create kernel + _kernel = create_kernel(compile_context, "gemmlowp_output_stage_quantize_down", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_3D(); + + unsigned int idx1 = num_arguments_per_3D_tensor(); + if(bias != nullptr) + { + Window biases_slice(slice); + biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); + biases_slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + add_1D_tensor_argument(idx1, bias, biases_slice); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx1, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file
diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
new file mode 100644
index 0000000000..84c5060362
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2020-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel used to quantize down the int32 accumulator values of GEMMLowp to QASYMM8/QASYMM8_SIGNED
+ *
+ * This kernel takes a final int32 accumulator value (the output of the matrix multiplication), and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value.
+ * The following computations will be performed by the kernel:
+ *
+ *  -# Add offset terms to final result
+ *  -# Multiply each entry of result by result_mult_int
+ *  -# Add bias to final result if bias tensor is not a nullptr
+ *  -# Shift the int32 accumulator by result_shift
+ *  -# Clamp the value between the specified min and max bounds
+ *  -# Clamp the resulting int32 values:
+ *     - to the [0..255] range and cast to QASYMM8.
+ *     - to the [-128..127] range and cast to QASYMM8_SIGNED.
+ */
+class ClGemmLowpQuantizeDownInt32ScaleKernel : public IClKernel
+{
+public:
+    ClGemmLowpQuantizeDownInt32ScaleKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmLowpQuantizeDownInt32ScaleKernel);
+    /** Initialise the kernel's source and destination.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor. Data type supported: S32
+     * @param[in]  bias            Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required.
+     *                             Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst             Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED
+     * @param[in]  output_stage    GEMMLowp output stage metadata.
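+     *
+     * @note Conceptual scalar model of the configured output stage (illustrative only: names such as
+     *       acc, bias_value, min and max are not library symbols, and the authoritative ordering of
+     *       the steps is the one implemented by the gemmlowp_output_stage_quantize_down CL kernel):
+     * @code
+     * int32_t v = acc;                          // one int32 GEMM accumulator
+     * v += bias_value;                          // only when bias != nullptr (-DADD_BIAS)
+     * v += output_stage->gemmlowp_offset;       // -DRESULT_OFFSET
+     * v *= output_stage->gemmlowp_multiplier;   // -DRESULT_MULT_INT
+     * v >>= output_stage->gemmlowp_shift;       // -DRESULT_SHIFT
+     * v = std::max(min, std::min(max, v));      // -DMIN_BOUND / -DMAX_BOUND clamping
+     * // finally cast v to QASYMM8 (uint8_t) or QASYMM8_SIGNED (int8_t)
+     * @endcode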
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
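+
+// Minimal usage sketch (illustrative, not part of the library sources): it assumes src, bias and
+// dst are already-allocated CLTensors and stage_info is a filled GEMMLowpOutputStageInfo. The
+// ACL_SRC/ACL_BIAS/ACL_DST pack slots below are the ones read back by run_op() above.
+//
+//   opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel kernel;
+//   ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(
+//       src.info(), bias.info(), dst.info(), &stage_info));
+//   kernel.configure(CLKernelLibrary::get().get_compile_context(), src.info(), bias.info(), dst.info(), &stage_info);
+//
+//   ITensorPack pack;
+//   pack.add_const_tensor(TensorType::ACL_SRC, &src);
+//   pack.add_const_tensor(TensorType::ACL_BIAS, &bias);
+//   pack.add_tensor(TensorType::ACL_DST, &dst);
+//   kernel.run_op(pack, kernel.window(), CLScheduler::get().queue());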
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp new file mode 100644 index 0000000000..7f6f5731d8 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -0,0 +1,219 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/KernelDescriptors.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + } + return Status{}; +} + +Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + } + return Status{}; +} +} // namespace + +IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +{ + // Perform validate step + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_a_reduction(mtx_a, 
vector_sum_row)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); + + auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DCOLS_A=" + support::cpp11::to_string(mtx_a->dimension(0))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_a->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_a->data_type())); + build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); + + const bool is_dot8_supported = dot8_supported(CLKernelLibrary::get().get_device()); + + std::string kernel_name = "gemmlowp_matrix_a_reduction" + std::string(is_dot8_supported ? "_dot8" : ""); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + // This kernel does not need padding + Window win = calculate_max_window(*vector_sum_row, Steps()); + ICLKernel::configure_internal(win); + + _config_id = kernel_name; + _config_id += "_"; + _config_id += support::cpp11::to_string(mtx_a->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mtx_a->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(mtx_a->dimension(2)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); + + return Status{}; +} + +void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); + Window slice_in = collapsed.first_slice_window_2D(); + Window slice_out = collapsed.first_slice_window_2D(); + + // Setup input slice. Its dimensions are increased in the cl kernel. 
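+    // The null dimensions (start = end = step = 0) set below collapse the window for the input: the
+    // kernel is handed the tensor as a whole and, as noted above, grows the dimensions internally,
+    // accumulating the COLS_A entries of each row itself. Conceptually (illustrative pseudo-code):
+    //     vector_sum_row[y] = scalar * sum(mtx_a[0..COLS_A-1][y])   // '* scalar' only with -DSCALAR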
+ slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + add_2D_tensor_argument(idx, dst, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} + +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); + + auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); + + // Set the arguments to pass at compile time + CLBuildOptions build_opts; + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); + build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_dot8_acc_type_from_data_type(mtx_b->data_type())); + build_opts.add_option_if(info.mul_by_scalar, "-DSCALAR=" + support::cpp11::to_string(info.scalar)); + + // Create kernel + _kernel = create_kernel(compile_context, "gemmlowp_matrix_b_reduction", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*vector_sum_col, Steps(num_elems_processed_per_iteration)); + IClKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +{ + ARM_COMPUTE_UNUSED(info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); + + return Status{}; +} + +void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); + + Window slice_out = collapsed.first_slice_window_2D(); + Window slice_in = slice_out; + + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + add_2D_tensor_argument(idx, dst, slice_out); + enqueue(queue, *this, slice_out, lws_hint()); + } + while(collapsed.slide_window_slice_2D(slice_out)); +} +} // namespace kernels +} // namespace opencl +} // 
namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
new file mode 100644
index 0000000000..7119b5fee0
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h
@@ -0,0 +1,124 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+#define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Common interface for all OpenCL GEMMLowp reduction kernels */
+class IClGemmLowpReductionKernel : public IClKernel
+{
+public:
+    IClGemmLowpReductionKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmLowpReductionKernel);
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  input           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] output          Output row-vector of sums of all the entries in each row/col of input tensor. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k             Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped   True if the matrix has been reshaped.
+     *                             - scalar        Scalar value to multiply each reduced column/row by.
+     *                             - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixAReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_a           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8.
+     * @param[out] vector_sum_row  Output row-vector of sums of all the entries in each row of mtx_a. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k             Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped   True if the matrix has been reshaped.
+     *                             - scalar        Scalar value to multiply each reduced column/row by.
+     *                             - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** OpenCL kernel used to compute the row-vectors of sums of all the entries in each column of Matrix B.
+ *
+ * @note This stage is needed to handle the offset of matrix product
+ *       https://github.com/google/gemmlowp/blob/master/doc/low-precision.md
+ */
+class ClGemmLowpMatrixBReductionKernel : public IClGemmLowpReductionKernel
+{
+public:
+    /** Initialise the kernel's input and output.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  mtx_b           Input tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM8/QSYMM8_PER_CHANNEL.
+     * @param[out] vector_sum_col  Output row-vector of sums of all the entries in each column of mtx_b. Data type supported: S32
+     * @param[in]  info            Kernel metadata:
+     *                             - k             Number of matrix columns/rows depending on the type of reduction.
+     *                             - is_reshaped   True if the matrix has been reshaped.
+     *                             - scalar        Scalar value to multiply each reduced column/row by.
+     *                             - mul_by_scalar True if each reduced column/row must be multiplied by a scalar value.
+     */
+    void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override;
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
new file mode 100644
index 0000000000..4e934f0f33
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.cpp
@@ -0,0 +1,538 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +inline Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((fp_mixed_precision && (src0->data_type() != DataType::F16)), "Mixed precision floating point is supported only for F16 data"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the matrix A must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the matrix B must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_interleaved_transposed && reshape_info.reinterpret_input_as_3d(), "The input tensor cannot be reinterpreted as 3D if is_interleaved_transposed is true"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 2 && reshape_info.reinterpret_input_as_3d(), "The src1 tensor cannot have more than 2 dimensions if src0 has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((reshape_info.reinterpret_input_as_3d() || reshape_info.depth_output_gemm3d() != 0) && (src2 != nullptr) + && (!reshape_info.broadcast_bias()), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + + 
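+    // Two validation paths follow: when A/B have not been reshaped the inner dimensions are checked
+    // directly, otherwise the expected reshaped shapes are rebuilt (via compute_lhs_reshaped_shape /
+    // compute_rhs_reshaped_shape with the same LHS/RHS block parameters used at reshape time) and
+    // compared against the actual inputs.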
if(!is_interleaved_transposed) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != src1->dimension(1)); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int m = reshape_info.reinterpret_input_as_3d() ? src0->dimension(1) * src0->dimension(2) : src0->dimension(1); + const unsigned int n = src1->dimension(0); + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(reshape_info.broadcast_bias()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + } + else + { + GEMMRHSMatrixInfo rhs_info; + GEMMLHSMatrixInfo lhs_info; + const auto m = static_cast<unsigned int>(reshape_info.m()); + const auto n = static_cast<unsigned int>(reshape_info.n()); + const int k = reshape_info.k(); + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + rhs_info.n0 = max_cl_vector_width / src1->element_size(); + rhs_info.k0 = 1; + rhs_info.h0 = mult_transpose1xW_width; + rhs_info.interleave = false; + rhs_info.transpose = false; + lhs_info.m0 = 4; + lhs_info.k0 = 4; + lhs_info.v0 = mult_interleave4x4_height; + lhs_info.interleave = true; + lhs_info.transpose = true; + + TensorShape tensor_shape0{ src0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(reshape_info.broadcast_bias()) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + } + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +inline std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + float beta, bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget 
gpu_target,
+                                                        ElementsProcessed &num_elements_processed)
+{
+    ARM_COMPUTE_UNUSED(beta);
+    bool   window_changed = false;
+    Window win{};
+    Window win_out{};
+
+    const DataType data_type                           = src0->data_type();
+    unsigned int  &num_elems_processed_per_iteration_x = num_elements_processed[0];
+    unsigned int  &num_elems_processed_per_iteration_y = num_elements_processed[1];
+    bool           reinterpret_input_as_3d             = reshape_info.reinterpret_input_as_3d();
+    bool           reinterpret_output_as_3d            = (reshape_info.depth_output_gemm3d() != 0);
+
+    // In case both input and dst have to be reinterpreted as 3D tensors,
+    // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false.
+    if(reinterpret_input_as_3d == reinterpret_output_as_3d)
+    {
+        reinterpret_input_as_3d  = false;
+        reinterpret_output_as_3d = false;
+    }
+
+    // dst tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, is_interleaved_transposed, reshape_info)));
+
+    TensorInfo tmp_info(*dst);
+
+    if(reinterpret_output_as_3d)
+    {
+        // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
+        // the window needs to be constructed on the 2D collapsed version of the tensor
+        TensorShape tmp_shape(dst->tensor_shape());
+        tmp_shape.collapse(2U, 1U);
+        tmp_info.set_tensor_shape(tmp_shape);
+    }
+
+    if(is_interleaved_transposed)
+    {
+        // reinterpret_input_as_3d is not supported if is_interleaved_transposed is set
+        ARM_COMPUTE_ERROR_ON(reshape_info.reinterpret_input_as_3d());
+
+        // Configure kernel window
+        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+        num_elems_processed_per_iteration_y = 4;
+
+        win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+        if(src2 != nullptr)
+        {
+            const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x;
+
+            const int bias_processed_per_iteration_y = reshape_info.broadcast_bias() ? 1 : num_elems_processed_per_iteration_y;
+
+            AccessWindowStatic src2_access(src2, 0, 0,
+                                           ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x),
+                                           ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y));
+
+            window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop
+        }
+    }
+    else // The input tensors have not been reshaped
+    {
+        // Special case for 1xN, 2xN, 3xN and 4xN src0 tensor. num_elems_processed_per_iteration_x is set up for the default case.
+        num_elems_processed_per_iteration_x = max_cl_vector_width / data_size_from_type(data_type);
+        num_elems_processed_per_iteration_y = std::min(static_cast<int>(dst->dimension(1)), 4);
+
+        // Create kernels according to the architecture, data type and input size.
+        GPUTarget arch_target = get_arch_from_target(gpu_target);
+        if(arch_target == GPUTarget::BIFROST && data_type == DataType::F32)
+        {
+            num_elems_processed_per_iteration_x = (src1->dimension(0) <= 1000 && src0->num_dimensions() == 1) ?
2 : 4; + } + + // Configure window + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); + AccessWindowStatic src1_access(src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, + dst->dimension(0), + dst->dimension(1)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + src2->dimension(1)); + + window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + else + { + window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmMatrixMultiplyKernel::ClGemmMatrixMultiplyKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmMatrixMultiplyKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, + float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, beta, + is_interleaved_transposed, reshape_info, fp_mixed_precision)); + + auto padding_info = is_interleaved_transposed ? get_padding_info({ src0, src1, dst }) : get_padding_info({ src0, dst }); + + _reinterpret_input_as_3d = reshape_info.reinterpret_input_as_3d(); + _reinterpret_output_as_3d = (reshape_info.depth_output_gemm3d() != 0); + _add_bias = src2 != nullptr; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = _reinterpret_input_as_3d ? 
src0->num_dimensions() - 1 : src0->num_dimensions(); + + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + const DataType data_type = src0->data_type(); + + // Get target architecture + GPUTarget gpu_target = get_target(); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, + gpu_target, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, both will be turned off (false) + // in which case we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) + const unsigned int internal_m = _reinterpret_output_as_3d ? dst->dimension(1) * dst->dimension(2) : dst->dimension(1); + const unsigned int n = dst->dimension(0); + + const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); + const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); + + const unsigned int m0 = num_elements_processed.y(); + const unsigned int n0 = num_elements_processed.x(); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % m0; + const unsigned int partial_store_n0 = n % n0; + + // Create build options + CLBuildOptions build_opts; + + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(reshape_info.broadcast_bias(), "-DBROADCAST_BIAS"); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(activation_info.activation()))); + build_opts.add_option_if(activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(activation_info.a())); + build_opts.add_option_if(activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(activation_info.b())); + build_opts.add_option("-DIN1_DIM_X=" + support::cpp11::to_string(src1->dimension(0))); + + const bool is_bifrost = get_arch_from_target(gpu_target) == GPUTarget::BIFROST; + + std::string kernel_name; + if(is_interleaved_transposed) + { + const int mult_transpose1xW_width = reshape_info.mult_transpose1xW_width(); + const int mult_interleave4x4_height = reshape_info.mult_interleave4x4_height(); + + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + 
support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(src1->dimension(0) / (n0 * mult_transpose1xW_width))); + build_opts.add_option("-DH0=" + support::cpp11::to_string(mult_transpose1xW_width)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(mult_interleave4x4_height)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + if(is_data_type_float(data_type) && is_bifrost) + { + kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; + } + else + { + kernel_name = "gemm_mm_interleaved_transposed_" + lower_string(string_from_data_type(data_type)); + if(fp_mixed_precision && data_type == DataType::F16) + { + // currently wider accumulator is only supported for fp16 kernels. + kernel_name += "_acc32"; + } + } + } + else // The input tensors have not been reshaped + { + build_opts.add_option("-DN=" + support::cpp11::to_string(n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(src0->dimension(0))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(n0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + // Create kernels according to the architecture, data type and input size. + if(is_data_type_float(data_type) && is_bifrost) + { + kernel_name = "gemm_mm_floating_point"; + + if(src0->num_dimensions() != 1) + { + kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost"; + if(fp_mixed_precision && data_type == DataType::F16) + { + // currently wider accumulator is only supported for fp16 kernels. + kernel_name += "_acc32"; + } + } + else if(src1->dimension(0) <= 1000 && data_type == DataType::F32) + { + // The first kernel is optimized for the case of 1000 or less dst elements (e.g. FC8 of AlexNet and VGG-16, and + // FC1 of Inception v3). The second kernel is optimized for the case of greater than 1000 dst elements (e.g. + // FC6 and FC7 of AlexNet and VGG-16). + kernel_name += "_" + lower_string(string_from_data_type(data_type)) + "_bifrost_1000"; + } + + // The work-group size equal to the Bifrost quad size has been proved to be optimal for these kernels + // via exhaustive autotuning over a range of representative layer configurations. + set_lws_hint(cl::NDRange(4)); + } + else // (MIDGARD and F32) or (F16) + { + kernel_name = "gemm_mm_floating_point"; + } + } + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = "gemm_"; + _config_id += (is_interleaved_transposed ? "reshaped_" : ""); + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (reshape_info.broadcast_bias() ? "broadcast_bias_" : ""); + _config_id += (fp_mixed_precision ? "fp_mixed_" : ""); + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? 
"3do_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(3)); + _config_id += "_"; + _config_id += (is_interleaved_transposed ? support::cpp11::to_string(src1->dimension(0)) : support::cpp11::to_string(src1->dimension(1))); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision, const ActivationLayerInfo &activation_info) +{ + // Note: num_elements_processed will be set in validate_and_configure_window() + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(activation_info); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, beta, is_interleaved_transposed, reshape_info, fp_mixed_precision)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + (src2 != nullptr) ? src2->clone().get() : nullptr, + dst->clone().get(), + beta, + is_interleaved_transposed, + reshape_info, + gpu_target, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const unsigned int num_arguments_bias = _add_bias ? 
num_arguments_per_2D_tensor() + 1 : 0; + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + num_arguments_bias; + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0) + num_arguments_bias; + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + if(_add_bias) + { + add_2D_tensor_argument(idx, src2, slice); + } + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); + } + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h new file mode 100644 index 0000000000..c16e3279f5 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply two input matrices "A" and "B" and add a matrix "C" if provided. All elements of the output matrix will be multiplied by alpha. In case matrix C is passed, it will be added to the previous result.
+ * For the matrix C, the broadcast addition is supported if the flag "broadcast_bias" is set in the GEMMReshapeInfo object
+ *
+ * @note If the input tensors @p src0 and @p src1 have been reshaped respectively with @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel,
+ *       the flag @p is_interleaved_transposed must be set to true
+ *
+ * @attention @p src1 tensor must have at least 2 dimensions (matrix)
+ */
+class ClGemmMatrixMultiplyKernel : public IClKernel
+{
+public:
+    ClGemmMatrixMultiplyKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyKernel);
+    /** Initialise the kernel's input, output and alpha
+     *
+     * @param[in]  compile_context           The compile context to be used.
+     * @param[in]  src0                      Input tensor containing the Matrix A. Data types supported: F16/F32
+     * @param[in]  src1                      Input tensor containing the Matrix B. Data type supported: same as @p src0
+     * @param[in]  src2                      Input tensor containing the Matrix C (bias). Can be nullptr. Data type supported: same as @p src0
+     * @param[out] dst                       Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
+     * @param[in]  alpha                     Weight of the matrix product
+     * @param[in]  beta                      (Optional) Weight of matrix C. Default value is 0. Only beta = 1 is currently supported.
+     * @param[in]  is_interleaved_transposed (Optional) True if @p src0 and @p src1 have been reshaped respectively using @ref ClGemmReshapeLhsMatrixKernel and @ref ClGemmReshapeRhsMatrixKernel
+     * @param[in]  reshape_info              (Optional) GEMM reshape info. If is_interleaved_transposed = true, this object must contain the information to understand how the matrix A and matrix B have been reshaped
+     * @param[in]  fp_mixed_precision        (Optional) Use wider accumulators (32 bit instead of 16 for FP16) to improve accuracy
+     * @param[in]  activation_info           (Optional) Activation to apply after the matrix multiplication
+     */
+    void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta = 0.f,
+                   bool is_interleaved_transposed = true, const GEMMReshapeInfo &reshape_info = GEMMReshapeInfo(), bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClGemmMatrixMultiplyKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+                           bool is_interleaved_transposed, const GEMMReshapeInfo &reshape_info, GPUTarget gpu_target, bool fp_mixed_precision = false, const ActivationLayerInfo &activation_info = ActivationLayerInfo());
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+public:
+    bool _slide_matrix_b{ true };
+    bool _reinterpret_input_as_3d{ false };
+    bool _reinterpret_output_as_3d{ false };
+    bool _add_bias{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
new file mode 100644
index 0000000000..448d35353b
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp
@@ -0,0 +1,416 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) + && (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native"); + + const unsigned int m = gemm_info.m; + const unsigned int n = gemm_info.n; + const unsigned int k = gemm_info.k; + + ARM_COMPUTE_UNUSED(m); + ARM_COMPUTE_UNUSED(n); + ARM_COMPUTE_UNUSED(k); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); + if(gemm_info.reinterpret_input_as_3d) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); + } + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(gemm_info.broadcast_bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + + 
if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_output_as_3d to be false: only the output flag affects this window computation, while configure() resets both member flags and dispatches a batched GEMM. + if(reinterpret_input_as_3d == reinterpret_output_as_3d) + { + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowStatic src0_access(src0, 0, 0, + src0->dimension(0), + src0->dimension(1)); + AccessWindowStatic src1_access(src1, 0, 0, + ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), + src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, + dst->dimension(0), + dst->dimension(1)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + src2->dimension(1)); + + window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + else + { + window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, 
dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + + auto padding_info = get_padding_info({ src0, dst }); + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _add_bias = src2 != nullptr; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. + if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + { + _reinterpret_input_as_3d = false; + _reinterpret_output_as_3d = false; + } + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second); + + // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true, + // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel. + // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m + const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1); + + const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1); + const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads. 
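+    // (e.g. with M = 2 and a heuristic-selected lhs_info.m0 = 4, the kernel is built with M0 = 2)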
+ // NOTE: This might have implications on heuristics and performance + const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0); + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + + std::string kernel_name("gemm_mm_native"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += (gemm_info.activation_info.enabled() ? 
"fused_activation_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.k0); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + src2 != nullptr ? src2->clone().get() : nullptr, + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor + unsigned int idx0; + if(_add_bias) + { + idx0 = 4 * num_arguments_per_2D_tensor() + 4; + } + else + { + idx0 = 3 * num_arguments_per_2D_tensor() + 3; + } + const unsigned int total_cross_plane_pad = src0->info()->padding().top + src0->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + if(_reinterpret_output_as_3d) + { + // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor + unsigned int idx0; + if(_add_bias) + { + idx0 = 4 * num_arguments_per_2D_tensor() + 4 + (_reinterpret_input_as_3d ? 1 : 0); + } + else + { + idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); + } + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + add_2D_tensor_argument(idx, src0, slice); + add_2D_tensor_argument(idx, src1, slice_b); + if(_add_bias) + { + add_2D_tensor_argument(idx, src2, slice); + } + add_2D_tensor_argument(idx, dst, slice); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); + } + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h new file mode 100644 index 0000000000..26dec918cd --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when neither of the input matrices have been reshaped */ +class ClGemmMatrixMultiplyNativeKernel : public IClKernel +{ +public: + ClGemmMatrixMultiplyNativeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyNativeKernel); + /** Initialise the kernel's input and dst. + * + * @param[in] compile_context The compile context to be used. 
+ * @param[in] src0 Input tensor for the LHS matrix. Data type supported: F32. The number of dimensions for the LHS matrix must be less than or equal to 4. + * @param[in] src1 Input tensor for the RHS matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3. + * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. + * @param[out] dst Output tensor info. Data type supported: same as @p src0 + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the matrix bias + * @param[in] lhs_info LHS matrix information used to retrieve the number of rows and accumulations to be processed by each thread. Only the following values are supported: + * lhs_info.m0: 1,2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * @param[in] rhs_info RHS matrix information used to retrieve the number of columns and accumulations to be processed by each thread. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 + * rhs_info.k0: same as lhs_info.k0 + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmMatrixMultiplyNativeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_input_as_3d{ false }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_NATIVE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp new file mode 100644 index 0000000000..959b3ab21d --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -0,0 +1,421 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include <cstddef> +#include <cstdint> +#include <tuple> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) + && (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); + + const unsigned int m = gemm_info.m; + const unsigned int n = gemm_info.n; + const unsigned int k = gemm_info.k; + + TensorShape tensor_shape0{ src0->tensor_shape() }; + tensor_shape0.set(0, k); + tensor_shape0.set(1, m); + + TensorShape 
tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); + if(gemm_info.broadcast_bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + + const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + const int bias_processed_per_iteration_y = gemm_info.broadcast_bias ? 
1 : num_elems_processed_per_iteration_y; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + ceil_to_multiple(src2->dimension(1), bias_processed_per_iteration_y)); + + window_changed = update_window_and_padding(win, src2_access); // window used by the execute_window_loop + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + + auto padding_info = get_padding_info({ src0, dst }); + _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _add_bias = src2 != nullptr; + _export_to_cl_image = rhs_info.export_to_cl_image; + _k = gemm_info.k; + + // Check if we need to slide the matrix B + const unsigned int num_dimensions_src0 = src0->num_dimensions(); + _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0); + + ElementsProcessed num_elements_processed{}; + + // Configure kernel window + auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + const bool enable_mixed_precision = gemm_info.fp_mixed_precision; + const DataType data_type = src0->data_type(); + + // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding. + const unsigned int internal_m = _reinterpret_output_as_3d ? 
gemm_info.m : dst->dimension(1); + + const unsigned int partial_store_m0 = internal_m % lhs_info.m0; + const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0; + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); + build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); + build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); + build_opts.add_option_if(lhs_info.transpose, "-DLHS_TRANSPOSE"); + build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(enable_mixed_precision, "-DMIXED_PRECISION"); + build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); + build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); + build_opts.add_option("-DM=" + support::cpp11::to_string(gemm_info.m)); + build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n)); + build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k)); + build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); + build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); + build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); + + std::string kernel_name("gemm_mm_reshaped_"); + kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; + kernel_name += rhs_info.transpose ? "rhs_t" : "rhs_nt"; + kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += (_add_bias ? 
"add_bias_" : ""); + _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += (enable_mixed_precision ? "mixed_precision_" : ""); + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.v0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.interleave); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + src2 != nullptr ? 
src2->clone().get() : nullptr, + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; + + cl::Image2D src1_image2d; + + if(_export_to_cl_image) + { + const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; + + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + + // LHS buffer + add_2D_tensor_argument(idx, src0, slice); + + // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) + if(_export_to_cl_image) + { + _kernel.setArg(idx++, src1_image2d); + } + else + { + add_2D_tensor_argument(idx, src1, slice_b); + } + + // Bias buffer (_add_bias == true) + add_2D_tensor_argument_if(_add_bias, idx, src2, slice); + + // dst buffer + add_2D_tensor_argument(idx, dst, slice); + + // K dimension (not used if _export_to_cl_image == true) + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(_k)); + + // LHS stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); + + // RHS stride_z (not used if _export_to_cl_image == true) + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); + + // Bias stride_z (if _add_bias == true) + if(_add_bias) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); + } + + // dst stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); + + // Cross-plan padding (if _reinterpret_output_as_3d = true) + if(_reinterpret_output_as_3d) + { + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad)); + } + + // Dispatch kernel + enqueue(queue, 
*this, slice, lws_hint(), _use_dummy_work_items); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
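Usage sketch: run_op() above reads its operands from an ITensorPack via the ACL_SRC_0/ACL_SRC_1/ACL_SRC_2 and ACL_DST slots. A minimal, hedged example of how a caller typically binds and enqueues a configured kernel, assuming lhs, rhs_reshaped, bias and dst are valid ICLTensor pointers and mm_kernel has been configured as above:

    // assumes: using namespace arm_compute;
    ITensorPack gemm_pack;
    gemm_pack.add_const_tensor(TensorType::ACL_SRC_0, lhs);          // matrix A
    gemm_pack.add_const_tensor(TensorType::ACL_SRC_1, rhs_reshaped); // reshaped matrix B
    gemm_pack.add_const_tensor(TensorType::ACL_SRC_2, bias);         // run_op() asserts this is non-null when _add_bias is set
    gemm_pack.add_tensor(TensorType::ACL_DST, dst);
    CLScheduler::get().enqueue_op(mm_kernel, gemm_pack, true); // flush the queue after dispatch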
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h new file mode 100644 index 0000000000..435a3a67f6 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -0,0 +1,113 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H +#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +#include "arm_compute/core/KernelDescriptors.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to multiply matrices when both the input matrices LHS (src0) and RHS (src1) have been reshaped + * + * @note The input matrices @p src0 and @p src1 must be reshaped through: + * - @ref ClGemmReshapeLhsMatrixKernel + * - @ref ClGemmReshapeRhsMatrixKernel + */ +class ClGemmMatrixMultiplyReshapedKernel : public IClKernel +{ +public: + ClGemmMatrixMultiplyReshapedKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedKernel); + /** Initialise the kernel's input and output. + * + * @note The F16 computation also supports mixed precision through the gemm_info.fp_mixed_precision flag. + * Mixed precision combines different floating precisions during the computation, in particular, F32 for the accumulations and F16 for the + * multiplications. i.e. float c = (half)a * (half)b + * + * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function. + * Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created by importing the OpenCL buffer, + * the following conditions are required: + * -# rhs_info.n0 can only be 4, 8 and 16 + * -# rhs_info.k0 can only be 4, 8 and 16 + * -# Data type can only be F32 + * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension + * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement + * -# src1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4) + * -# src1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT + * + * @param[in] compile_context The compile context to be used. 
+ * @param[in] src0 Input tensor containing the LHS reshaped matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true). The number of dimensions for the LHS matrix must be less than or equal to 4 + * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3 + * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0. + * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0 + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of the matrix bias + * @param[in] lhs_info LHS matrix information used for reshaping the src0 tensor. Only the following values are supported: + * lhs_info.m0: 2,3,4,5,6,7,8 + * lhs_info.k0: 2,3,4,8,16 + * lhs_info.transpose: false + * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.k0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image = true) + * rhs_info.transpose: true + * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices + * + * @note lhs_info.k0 must be equal to rhs_info.k0 + */ + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + bool _slide_matrix_b{ true }; + bool _reinterpret_output_as_3d{ false }; + bool _use_dummy_work_items{ false }; + bool _add_bias{ false }; + bool _export_to_cl_image{ false }; + unsigned int _k{ 1 }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_KERNEL_H */
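Configuration sketch: a minimal, hedged example of driving validate()/configure() under the constraints documented above. The block sizes are illustrative placeholders (production values come from the GEMM heuristics), and lhs_reshaped, rhs_reshaped and dst are assumed to be valid ITensorInfo pointers describing already-reshaped operands:

    // assumes: using namespace arm_compute; using namespace arm_compute::opencl::kernels;
    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0 = 4; lhs_info.k0 = 4; lhs_info.v0 = 2;
    lhs_info.transpose = false; // required by this kernel
    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0 = 4; rhs_info.k0 = 4; rhs_info.h0 = 4;
    rhs_info.transpose = true;  // required by this kernel; k0 must match lhs_info.k0
    GEMMKernelInfo gemm_info;
    gemm_info.m = 128; gemm_info.n = 96; gemm_info.k = 64; // original (pre-reshape) GEMM dimensions

    ClGemmMatrixMultiplyReshapedKernel mm_kernel;
    const Status st = ClGemmMatrixMultiplyReshapedKernel::validate(lhs_reshaped, rhs_reshaped, nullptr /* no bias */, dst,
                                                                   1.f /* alpha */, 0.f /* beta */, lhs_info, rhs_info, gemm_info);
    if(st.error_code() == ErrorCode::OK)
    {
        mm_kernel.configure(CLKernelLibrary::get().get_compile_context(), lhs_reshaped, rhs_reshaped, nullptr, dst,
                            1.f, 0.f, lhs_info, rhs_info, gemm_info);
    }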
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp new file mode 100644 index 0000000000..149c92b7a9 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -0,0 +1,443 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLUtils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +using ElementsProcessed = Steps; + +Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && 
rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) + && (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); + + const unsigned int m = gemm_info.m; + const unsigned int n = gemm_info.n; + const unsigned int k = gemm_info.k; + + TensorShape tensor_shape1{ src1->tensor_shape() }; + tensor_shape1.set(0, n); + tensor_shape1.set(1, k); + + if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + { + const unsigned int src2_dim0 = src2->dimension(0); + const unsigned int src2_dim1 = src2->dimension(1); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); + if(gemm_info.broadcast_bias) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim0 != n || src2_dim1 != m), "Incorrect dimension of bias matrix"); + } + } + + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + + const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); + if(gemm_info.reinterpret_input_as_3d) + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); + + if(dst->total_size() != 0) + { + const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +{ + unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; + unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + + Window win{}; + Window win_out{}; + bool window_changed = false; + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
+ // This approach should only be used when the input/dst tensors have pad on the y direction + if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) + { + reinterpret_output_as_3d = false; + } + + // dst tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + + TensorInfo tmp_info(*dst); + + if(reinterpret_output_as_3d) + { + // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, + // the window needs to be constructed on the 2D collapsed version of the tensor + TensorShape tmp_shape(dst->tensor_shape()); + tmp_shape.collapse(2U, 1U); + tmp_info.set_tensor_shape(tmp_shape); + } + + // Configure kernel window + num_elems_processed_per_iteration_x = rhs_info.n0; + num_elems_processed_per_iteration_y = lhs_info.m0; + + win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + if(src2 != nullptr) + { + const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; + + AccessWindowStatic src2_access(src2, 0, 0, + ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + src2->dimension(1)); + + window_changed = update_window_and_padding(win, src2_access); + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win; + const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); + collapsed = win.collapse(win, dimension_to_collapse); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKernel() +{ + _type = CLKernelType::GEMM; +} + +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + + _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; + _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; + _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); + _add_bias = src2 != nullptr; + _export_to_cl_image = rhs_info.export_to_cl_image; + _has_pad_y = gemm_info.has_pad_y; + + auto padding_info = get_padding_info({ src0, src1, dst }); + + // In case both input and dst have to be reinterpreted as 3D tensors, + // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
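+ // (When both flags are set, the kernel is instead dispatched as a batched 2D GEMM, which reduces the
+ // complexity of the address calculation within the OpenCL kernel; see the internal_m comment below.)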
+ if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y)
+ {
+ _reinterpret_input_as_3d = false;
+ _reinterpret_output_as_3d = false;
+ }
+
+ // Check if we need to slide the matrix B
+ const unsigned int num_dimensions_src0 = src0->num_dimensions();
+ _slide_matrix_b = (src1->num_dimensions() >= num_dimensions_src0);
+
+ ElementsProcessed num_elements_processed{};
+
+ // Configure kernel window
+ auto win_config = validate_and_configure_window(src0, src1, src2, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+ ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
+ ICLKernel::configure_internal(win_config.second);
+
+ // If _reinterpret_input_as_3d = _reinterpret_output_as_3d = true,
+ // we will dispatch a batched-GEMM to reduce the complexity of the address calculation within the OpenCL kernel.
+ // This means that the actual m used by the kernel is given by dst->dimension(1) and not by gemm_info.m
+ const unsigned int internal_m = _reinterpret_output_as_3d ? gemm_info.m : dst->dimension(1);
+
+ // These variables are used only if gemm_info.has_pad_y == true
+ const unsigned int h_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(1) : src0->dimension(1);
+ const unsigned int d_gemm_3d = _reinterpret_output_as_3d ? dst->dimension(2) : src0->dimension(2);
+
+ // Shrink M0 to be always <= M (internal_m) to prevent out-of-bounds reads.
+ // NOTE: This might have implications on heuristics and performance
+ const unsigned int internal_m0 = std::min(internal_m, lhs_info.m0);
+
+ // Calculate partial (store instead of load) M0 and partial N0 for the partial blocks at the end of a row/column if any. This is to avoid padding.
+ const unsigned int partial_store_m0 = internal_m % internal_m0;
+ const unsigned int partial_store_n0 = gemm_info.n % rhs_info.n0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type()));
+ build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha));
+ build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta));
+ build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA");
+ build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS");
+ build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+ build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
+ build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS");
+ build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT");
+ build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1)));
+ build_opts.add_option("-DM=" + support::cpp11::to_string(internal_m));
+ build_opts.add_option("-DN=" + support::cpp11::to_string(gemm_info.n));
+ build_opts.add_option("-DK=" + support::cpp11::to_string(gemm_info.k));
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0));
+ build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0));
+ build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0));
+ build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0));
+ build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0));
+ build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" +
lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + if(_has_pad_y) + { + build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + } + + std::string kernel_name("gemm_mm_reshaped_only_rhs_"); + kernel_name += rhs_info.transpose ? "t" : "nt"; + kernel_name += rhs_info.export_to_cl_image ? "_texture" : ""; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += (_has_pad_y ? "" : "no_pad_y_"); + _config_id += (_add_bias ? "add_bias_" : ""); + _config_id += (gemm_info.broadcast_bias ? "broadcast_bias_" : ""); + _config_id += (_reinterpret_input_as_3d ? "3di_" : ""); + _config_id += (_reinterpret_output_as_3d ? "3do_" : ""); + _config_id += (gemm_info.activation_info.enabled() ? "fused_activation_" : ""); + _config_id += lower_string(string_from_data_type(src0->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(gemm_info.k); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.n0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.h0); + _config_id += "_"; + _config_id += support::cpp11::to_string(rhs_info.interleave); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +{ + ElementsProcessed num_elements_processed{}; + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), + src1->clone().get(), + src2 != nullptr ? 
src2->clone().get() : nullptr, + dst->clone().get(), + lhs_info, + rhs_info, + gemm_info, + num_elements_processed) + .first); + + return Status{}; +} + +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); + ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); + + if(src1->info()->num_dimensions() < 3) + { + // The stride_z for matrix B must be zero if we do not slice + ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); + } + + const size_t lhs_idx_batch_size = _reinterpret_input_as_3d && !_has_pad_y ? 3u : 2u; + const size_t rhs_idx_batch_size = 2u; + const size_t bia_idx_batch_size = 2u; + const size_t out_idx_batch_size = _reinterpret_output_as_3d && !_has_pad_y ? 3u : 2u; + + Window slice = window.first_slice_window_3D(); + Window slice_matrix_b = slice; + + slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); + slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); + + // Get cross plane pads + const unsigned int total_cross_plane_pad_lhs = src0->info()->padding().top + src0->info()->padding().bottom; + const unsigned int total_cross_plane_pad_out = dst->info()->padding().top + dst->info()->padding().bottom; + + // The execution should fail if we try to run with has_pad_y = false but we have padding in either the LHS or DST tensor + ARM_COMPUTE_ERROR_ON(!_has_pad_y && ((total_cross_plane_pad_lhs != 0) || (total_cross_plane_pad_out != 0))); + + cl::Image2D src1_image2d; + + if(_export_to_cl_image) + { + const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; + + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch); + } + + do + { + Window slice_b = slice; + // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 + // This scenario can happen when the matrix multiplication is used to perform a convolution operation + if(!_slide_matrix_b) + { + slice_b = slice_matrix_b; + } + + unsigned int idx = 0; + + // LHS buffer + add_2D_tensor_argument(idx, src0, slice); + + // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) + if(_export_to_cl_image) + { + _kernel.setArg(idx++, src1_image2d); + } + else + { + add_2D_tensor_argument(idx, src1, slice_b); + } + + // Bias buffer (_add_bias == true) + add_2D_tensor_argument_if(_add_bias, idx, src2, slice); + + // dst buffer + add_2D_tensor_argument(idx, dst, slice); + + // LHS stride_z + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[lhs_idx_batch_size])); + + // RHS stride_z (not used if _export_to_cl_image == true) + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned 
int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size]));
+
+ // Bias stride_z (if _add_bias == true)
+ if(_add_bias)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size]));
+ }
+
+ // dst stride_z
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size]));
+
+ // Cross-plane padding (if _reinterpret_input_as_3d = true)
+ if(_reinterpret_input_as_3d && _has_pad_y)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs));
+ }
+
+ // Cross-plane padding (if _reinterpret_output_as_3d = true)
+ if(_reinterpret_output_as_3d && _has_pad_y)
+ {
+ _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out));
+ }
+
+ enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
+ }
+ while(window.slide_window_slice_3D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
new file mode 100644
index 0000000000..3be96d3add
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h
@@ -0,0 +1,104 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+#include "arm_compute/core/KernelDescriptors.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to multiply matrices when only the input matrix RHS (src1) has been reshaped
+ *
+ * @note The input matrix src1 must be reshaped through @ref ClGemmReshapeRhsMatrixKernel
+ */
+class ClGemmMatrixMultiplyReshapedOnlyRhsKernel : public ICLKernel
+{
+public:
+ ClGemmMatrixMultiplyReshapedOnlyRhsKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmMatrixMultiplyReshapedOnlyRhsKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will fetch the RHS data using the OpenCL read_image built-in function.
+ * Reading from the OpenCL image object can increase performance. However, since the OpenCL image object is created importing the OpenCL buffer,
+ * the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# The stride Y for the src1 should satisfy the OpenCL pitch alignment requirement
+ * -# src1 width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# src1 (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src0 Input tensor containing the LHS matrix. Data type supported: F16/F32 (only F32 if rhs_info.export_to_cl_image = true).
+ * The number of dimensions for the LHS matrix must be less than or equal to 4.
+ * @param[in] src1 Input tensor containing the RHS reshaped matrix. Data type supported: same as @p src0. The number of dimensions for the RHS matrix must be less than or equal to 3.
+ * @param[in] src2 Input tensor containing the bias matrix. Data type supported: same as @p src0.
+ * @param[out] dst Output tensor to store the result of matrix multiplication. Data type supported: same as @p src0
+ * @param[in] alpha Weight of the matrix product
+ * @param[in] beta Weight of the matrix bias
+ * @param[in] lhs_info LHS matrix information used to retrieve the number of rows to be processed by each thread. Only the following values are supported:
+ * lhs_info.m0: 1,2,3,4,5,6,7,8
+ * @param[in] rhs_info RHS matrix information used for reshaping the src1 tensor. Only the following values are supported:
+ * rhs_info.k0: 2,3,4,8,16
+ * rhs_info.n0: 2,3,4,8,16
+ * rhs_info.transpose: true,false
+ * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices
+ */
+ void configure(const ClCompileContext &compile_context,
+ ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta,
+ const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta,
+ const GEMMLHSMatrixInfo &lhs_info,
+ const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _slide_matrix_b{ true };
+ bool _reinterpret_input_as_3d{ false };
+ bool _reinterpret_output_as_3d{ false };
+ bool _use_dummy_work_items{ false };
+ bool _add_bias{ false };
+ bool _export_to_cl_image{ false };
+ bool _has_pad_y{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
new file mode 100644
index 0000000000..4a01c77d0a
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp
@@ -0,0 +1,224 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); + + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), + misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; + const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; + bool window_changed = false; + + TensorInfo tmp_info(*src); + + if(reinterpret_input_as_3d) + { + // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, + // the window needs to be constructed on the 2D collapsed version of the tensor + 
TensorShape tmp_shape(src->tensor_shape());
+ tmp_shape.collapse(2U, 1U);
+ tmp_info.set_tensor_shape(tmp_shape);
+ }
+
+ // dst auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)));
+
+ // Configure window
+ Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+ Window win_in = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+
+ AccessWindowStatic src_access(src, 0, 0,
+ src->dimension(0),
+ src->dimension(1));
+ AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1));
+
+ window_changed = update_window_and_padding(win_in, src_access) || // window used by the execute_window_loop
+ update_window_and_padding(win, dst_access); // window used to update the padding requirements of dst tensor
+
+ // Collapse along the Z direction
+ // This collapse needs to be here in order to tune the Z dimension of LWS
+ Window collapsed = win.collapse(win, Window::DimZ);
+
+ Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{};
+ return std::make_pair(err, collapsed);
+}
+} // namespace
+
+ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+
+ // Perform validate step
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d));
+
+ auto padding_info = get_padding_info({ src });
+
+ _reinterpret_input_as_3d = reinterpret_input_as_3d;
+
+ const unsigned int src_w = src->dimension(0);
+ const unsigned int src_h = _reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1);
+ const unsigned int partial_load_m0 = src_h % lhs_info.m0;
+ const unsigned int partial_load_k0 = src_w % lhs_info.k0;
+
+ // Create build options
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0));
+ build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0));
+ build_opts.add_option("-DV0=" + support::cpp11::to_string(lhs_info.v0));
+ build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_w));
+ build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_h));
+ build_opts.add_option_if(lhs_info.interleave, "-DINTERLEAVE");
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D");
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(src->dimension(1)));
+ build_opts.add_option_if(_reinterpret_input_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(src->dimension(2)));
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()));
+ build_opts.add_option("-DPARTIAL_LOAD_M0=" + support::cpp11::to_string(partial_load_m0));
+ build_opts.add_option("-DPARTIAL_LOAD_K0=" + support::cpp11::to_string(partial_load_k0));
+
+ std::string kernel_name("gemm_reshape_lhs_matrix_");
+ kernel_name += lhs_info.transpose ?
"t" : "nt"; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, lhs_info, reinterpret_input_as_3d); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = "gemm_reshape_lhs_matrix_"; + _config_id += (_reinterpret_input_as_3d ? "3d_" : ""); + _config_id += lower_string(string_from_data_type(src->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.m0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.k0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.v0); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.interleave); + _config_id += "_"; + _config_id += support::cpp11::to_string(lhs_info.transpose); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), lhs_info, reinterpret_input_as_3d).first); + + return Status{}; +} + +void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + Window slice = window.first_slice_window_3D(); + + if(_reinterpret_input_as_3d) + { + // Pass bottom paddings to the kernel if the src has to be reinterpreted as 3D tensor + const unsigned int idx0 = 2 * num_arguments_per_3D_tensor(); + const unsigned int total_cross_plane_pad = src->info()->padding().top + src->info()->padding().bottom; + _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); + } + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h new file mode 100644 index 0000000000..69ec8f04f0 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h @@ -0,0 +1,78 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to reshape the LHS matrix when performing the matrix multiplication.
+ * In particular, this kernel splits the src matrix into blocks of size M0xK0 (defined through GEMMLHSMatrixInfo) and
+ * stores each one in the dst matrix, unrolling the values
+ */
+class ClGemmReshapeLhsMatrixKernel : public ICLKernel
+{
+public:
+ ClGemmReshapeLhsMatrixKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeLhsMatrixKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Input tensor. Data types supported: All
+ * @param[out] dst Output tensor. Data type supported: same as @p src
+ * @param[in] lhs_info LHS matrix information to be used for reshaping. This object contains all the necessary
+ * information to reshape the src tensor. Only the following values are supported:
+ * lhs_info.m0: 2,3,4,5,6,7,8
+ * lhs_info.k0: 2,3,4,8,16
+ * lhs_info.v0: greater than 0
+ * lhs_info.transpose: true, false
+ * lhs_info.interleave: true, false
+ * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as a 3D tensor
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d);
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+
+private:
+ bool _reinterpret_input_as_3d{ false };
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
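// --------------------------------------------------------------------------
// Editor's note: a minimal, illustrative sketch (not part of this patch) of
// how a caller might drive the static validate() declared above before
// configuring the kernel. TensorInfo, TensorShape and GEMMLHSMatrixInfo are
// the standard arm_compute types used throughout this file; the function name
// try_lhs_reshape and the concrete shape/block sizes are hypothetical, and an
// initialised CL backend is assumed for the F16-support query.
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"

arm_compute::Status try_lhs_reshape()
{
    using namespace arm_compute;

    TensorInfo src(TensorShape(64U /* K */, 128U /* M */), 1, DataType::F32);
    TensorInfo dst{}; // left empty on purpose: auto-initialized from src and lhs_info

    GEMMLHSMatrixInfo lhs_info;
    lhs_info.m0         = 4;    // rows per block: 2..8, as documented above
    lhs_info.k0         = 4;    // columns per block: 2,3,4,8,16
    lhs_info.v0         = 2;    // vertical blocks on the same output row: > 0
    lhs_info.transpose  = true;
    lhs_info.interleave = true;

    // Returns an error Status (instead of asserting) if any constraint fails.
    return opencl::kernels::ClGemmReshapeLhsMatrixKernel::validate(&src, &dst, lhs_info, false);
}
// --------------------------------------------------------------------------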
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp new file mode 100644 index 0000000000..778b9b9fa2 --- /dev/null +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); + ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); + + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(rhs_info.export_to_cl_image) + { + const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); + ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); + } + + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, 
rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; + const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; + bool window_changed = false; + + // dst auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); + + // Configure window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + + AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + + window_changed = update_window_and_padding(win, src_access); + + if(rhs_info.export_to_cl_image) + { + gemm::update_padding_for_cl_image(dst); + } + + // Collapse along the Z direction + // This collapse needs to be here in order to tune the Z dimension of LWS + Window collapsed = win.collapse(win, Window::DimZ); + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, collapsed); +} +} // namespace + +ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, rhs_info)); + + // Create build options + CLBuildOptions build_opts; + build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); + build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); + build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); + build_opts.add_option_if(rhs_info.transpose, "-DTRANSPOSE"); + build_opts.add_option_if(rhs_info.interleave, "-DINTERLEAVE"); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); + + std::string kernel_name("gemm_reshape_rhs_matrix_"); + kernel_name += rhs_info.transpose ? 
"t" : "nt"; + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, rhs_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + ICLKernel::configure_internal(win_config.second); +} + +Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); + + return Status{}; +} + +void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + Window slice = window.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file
diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
new file mode 100644
index 0000000000..31eaa46e02
--- /dev/null
+++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h
@@ -0,0 +1,84 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to reshape the RHS matrix when performing the matrix multiplication
+ * In particular, this kernel splits the src matrix into blocks of size K0xN0 and stores each one in
+ * the dst matrix unrolling the values */
+class ClGemmReshapeRhsMatrixKernel : public ICLKernel
+{
+public:
+ ClGemmReshapeRhsMatrixKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClGemmReshapeRhsMatrixKernel);
+ /** Initialise the kernel's input and output.
+ *
+ * @note If rhs_info.export_to_cl_image = true, this OpenCL kernel will guarantee the OpenCL pitch alignment for the output tensor,
+ * required to create an OpenCL image object from a buffer in @ref ClGemmMatrixMultiplyReshapedKernel and in @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel
+ * Since the OpenCL image object is created importing the OpenCL buffer, the following conditions are required:
+ * -# rhs_info.n0 can only be 4, 8 and 16
+ * -# rhs_info.k0 can only be 4, 8 and 16
+ * -# Data type can only be F32, F16
+ * -# The platform should support the OpenCL cl_khr_image2d_from_buffer extension
+ * -# output width should be less than or equal to (CL_DEVICE_IMAGE2D_MAX_WIDTH * 4)
+ * -# output (height * depth) should be less than or equal to CL_DEVICE_IMAGE2D_MAX_HEIGHT
+ * -# The output tensor should only be consumed by @ref ClGemmMatrixMultiplyReshapedKernel or @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Input tensor. Data types supported: All
+ * @param[out] dst Output tensor. Data type supported: same as @p src
+ * @param[in] rhs_info RHS matrix information to be used for reshaping. This object contains all the necessary
+ * information to reshape the src tensor.
Only the following values are supported: + * rhs_info.n0: 2,3,4,8,16 (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.k0: 1,2,3,4,8,16 (k0 = 1 only if rhs_info.transpose = false), (only 4, 8 and 16 if rhs_info.export_to_cl_image == true) + * rhs_info.h0: greater than 0 + * rhs_info.transpose: true, false + * rhs_info.interleave: true, false + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
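// --------------------------------------------------------------------------
// Editor's note: an illustrative sketch (not part of this patch) of picking
// RHS block sizes that satisfy the cl_image constraints listed in the doc
// comment above. GEMMRHSMatrixInfo is the standard arm_compute type; the
// function name and the concrete sizes are hypothetical. On a device without
// cl_khr_image2d_from_buffer, validate() returns a failing Status, which a
// caller can use to fall back to the plain buffer path.
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"

arm_compute::Status try_rhs_reshape_for_cl_image()
{
    using namespace arm_compute;

    TensorInfo src(TensorShape(96U /* N */, 64U /* K */), 1, DataType::F32);
    TensorInfo dst{}; // auto-initialized from src and rhs_info when left empty

    GEMMRHSMatrixInfo rhs_info;
    rhs_info.n0                 = 4;    // only 4, 8, 16 with export_to_cl_image
    rhs_info.k0                 = 4;    // only 4, 8, 16 with export_to_cl_image
    rhs_info.h0                 = 8;    // must be > 0
    rhs_info.transpose          = true;
    rhs_info.interleave         = true;
    rhs_info.export_to_cl_image = true; // requires cl_khr_image2d_from_buffer

    return opencl::kernels::ClGemmReshapeRhsMatrixKernel::validate(&src, &dst, rhs_info);
}
// --------------------------------------------------------------------------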
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp new file mode 100644 index 0000000000..688308098a --- /dev/null +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp @@ -0,0 +1,132 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); + + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); + for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +ClHeightConcatenateKernel::ClHeightConcatenateKernel() + : _height_offset(0) +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, height_offset, dst)); + return Status{}; +} + +void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + _height_offset = height_offset; + + // Add build options + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, 
src->dimension(0)); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_height", build_opts.options()); + // Configure kernel window + + // The window needs to be based on src as we copy all the heights of src + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, window); + add_4D_tensor_argument(idx, dst, window); + enqueue(queue, *this, window, lws_hint()); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h new file mode 100644 index 0000000000..1e544d3025 --- /dev/null +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H +#define ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the height concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class ClHeightConcatenateKernel : public IClKernel +{ +public: + ClHeightConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClHeightConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: All. + * @param[in] height_offset The starting offset on the Y axis for the dst tensor. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClHeightConcatenateKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int height_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; + +private: + unsigned int _height_offset; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_HEIGHT_CONCATENATE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp new file mode 100644 index 0000000000..c42762b99c --- /dev/null +++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp @@ -0,0 +1,431 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClIm2ColKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include <cmath> +#include <tuple> +#include <utility> + +namespace arm_compute +{ +using namespace misc::shape_calculator; +namespace opencl +{ +namespace kernels +{ +namespace +{ +struct Im2ColConfiguration +{ + std::string kernel_name{}; + std::set<std::string> build_options{}; + unsigned int num_elems_processed_per_iteration{}; + bool is_padding_required_nchw{}; +}; + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_groups) +{ + const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_layout() == DataLayout::NHWC && num_groups > 1); + ARM_COMPUTE_RETURN_ERROR_ON((src->dimension(channel_idx) % num_groups) != 0); + + // Since there's no implicit padding added, check the total input spatial dimensions (with conv paddings) are big enough for the kernel dimensions + const unsigned int width_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + const unsigned total_width = src->dimension(width_idx) + conv_info.pad_left() + conv_info.pad_right(); + const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); + ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); + + if(dst->total_size() > 0) + { + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto 
initialization if not yet initialized + TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); + + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); + + const DataLayout data_layout = src->data_layout(); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int input_width = src->dimension(width_idx); + const unsigned int input_height = src->dimension(height_idx); + + // Configure the execute window based on the selected optimal OpenCL kernel + bool window_changed = false; + Window win; + + if(data_layout == DataLayout::NHWC) + { + win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + } + else + { + if(is_padding_required_nchw) + { + const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); + win = calculate_max_window(*src, + Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); + AccessWindowStatic input_access(src, + -border.left, + -border.top, + ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), + input_height + border.bottom); + window_changed = window_changed || update_window_and_padding(win, input_access); + } + else + { + // For the generic case, CLIm2ColKernel doesn't need padding (we do not read out-of-bounds elements) so + // update_window_and_padding() can be skipped + win = calculate_max_window(*src, Steps()); + } + } + + // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension + win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} + +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +{ + const DataLayout data_layout = src->data_layout(); + const DataType data_type = src->data_type(); + const unsigned int width_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const unsigned int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const unsigned int input_width = src->dimension(width_idx); + const unsigned int input_height = src->dimension(height_idx); + const unsigned int input_channel = src->dimension(channel_idx); + + const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + + // Im2Col configuration + std::string kernel_name = "im2col_generic_"; + CLBuildOptions build_opts; + unsigned int num_elems_processed_per_iteration = 1; + bool is_padding_required_nchw = false; + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src->element_size())); + build_opts.add_option("-DKERNEL_WIDTH=" + support::cpp11::to_string(kernel_dims.width)); + build_opts.add_option("-DKERNEL_HEIGHT=" + support::cpp11::to_string(kernel_dims.height)); + build_opts.add_option("-DCONVOLVED_WIDTH=" + support::cpp11::to_string(convolved_dims.first)); + build_opts.add_option("-DCONVOLVED_HEIGHT=" + support::cpp11::to_string(convolved_dims.second)); + build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(conv_info.stride().first)); + build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(conv_info.stride().second)); + build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); + build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); + build_opts.add_option("-DPAD_RIGHT=" + support::cpp11::to_string(conv_info.pad_right())); + build_opts.add_option("-DPAD_BOTTOM=" + support::cpp11::to_string(conv_info.pad_bottom())); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(input_width)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(input_height)); + build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(input_channel)); + build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); + build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); + build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); + build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); + build_opts.add_option_if(has_bias, "-DHAS_BIAS"); + + if(data_layout == DataLayout::NHWC) + { + num_elems_processed_per_iteration = std::min(2U, input_channel); + is_padding_required_nchw = false; + + // Only the 3x3 and 9x9 cases are optimized for NHWC + if(kernel_dims == Size2D(3U, 3U)) + { + kernel_name = "im2col3x3_"; + } + else if(kernel_dims == Size2D(9U, 9U)) + { + kernel_name = 
"im2col9x9_"; + } + + // Get boundary vector (the first/last vector with potentially a partial vector size) size + // If input_channel is a multiple of num_elems_processed_per_iteration, the boundary vec size is the (full) vector size + // otherwise, the boundary vec size is the (partial) remainder vector size + const unsigned int vec_size = num_elems_processed_per_iteration; + const unsigned int partial_vec_size = input_channel % vec_size; + const unsigned int boundary_vec_size = vec_size - ((vec_size - partial_vec_size) % vec_size); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DBOUNDARY_VECTOR_SIZE=" + support::cpp11::to_string(boundary_vec_size)); + } + else + { + if(dilation == Size2D(1U, 1U)) + { + const bool squared_im2col = kernel_dims.width == kernel_dims.height; + if(squared_im2col) + { + // Check if we can run an optimized im2col for NCHW + switch(kernel_dims.width) + { + case 1: + // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false + if(conv_info.stride().first == 1 && !conv_info.has_padding()) + { + kernel_name = "im2col1x1_stridex1_"; + num_elems_processed_per_iteration = 4; + is_padding_required_nchw = true; + } + break; + case 3: + kernel_name = "im2col3x3_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = true; + break; + case 5: + kernel_name = "im2col5x5_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = true; + break; + case 11: + // Optimized im2col11x11 if pad_x = pad_y = 0 + if(!conv_info.has_padding()) + { + kernel_name = "im2col11x11_padx0_pady0_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = true; + } + break; + default: + kernel_name = "im2col_generic_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = false; + break; + } + } + else if(kernel_dims.width > 1 && !conv_info.has_padding()) + { + kernel_name = "im2col_generic_padx0_pady0_"; + num_elems_processed_per_iteration = 1; + is_padding_required_nchw = false; + + // Optimized im2col is performed using one or more vector operations with the specified vector size + // and a remainder. For example, for 5x5 convolutions, im2col is performed using vectors of size 4 + // and scalars; for 7x7 convolutions, using vectors of size 4 and vectors of size 3. + // Using the vector size of 4 is always safe since OpenCL supports vectors of size 2 and 3. + // Using the vector size of 8, however, may be faster. + // For 2x2 convolutions, use vectors of size 2. (For 3x3 convolutions, im2col_kernel3x3_padx0_pady0 + // is used instead.) 
+ const size_t vector_size = std::min(static_cast<size_t>(4), kernel_dims.width); + const size_t width_mod_vector_size = kernel_dims.width % vector_size; + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DWIDTH_MOD_VECTOR_SIZE=" + support::cpp11::to_string(width_mod_vector_size)); + } + } + } + + // Append the data layout to the kernel_name + kernel_name += lower_string(string_from_data_layout(data_layout)); + + Im2ColConfiguration im2col_config; + im2col_config.kernel_name = kernel_name; + im2col_config.build_options = build_opts.options(); + im2col_config.num_elems_processed_per_iteration = num_elems_processed_per_iteration; + im2col_config.is_padding_required_nchw = is_padding_required_nchw; + + return im2col_config; +} +} // namespace + +ClIm2ColKernel::ClIm2ColKernel() + : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, + const Size2D &dilation, + unsigned int num_groups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); + + auto padding_info = get_padding_info({ src, dst }); + _data_layout = src->data_layout(); + + const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const unsigned int height_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int input_width = src->dimension(width_idx); + const unsigned int input_height = src->dimension(height_idx); + + // Select and configure the optimal OpenCL kernel to run. 
+ // This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration + // and the padding requirement flag + Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + + // Create kernel + _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); + + _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; + _kernel_dims = kernel_dims; // Only needed by the Tuner + _conv_info = conv_info; // Only needed by the Tuner + _num_groups = num_groups; + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + im2col_config.is_padding_required_nchw, num_groups); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second); + + // Set config_id for enabling LWS tuning + _config_id = im2col_config.kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(src->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(num_groups); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); + + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); +} + +Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, + unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); + Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + im2col_config.is_padding_required_nchw, num_groups) + .first); + return Status{}; +} + +void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); + ARM_COMPUTE_ERROR_ON(tensors.empty()); + + // Get initial windows + // Collapse in order to have (SRC_DEPTH * BATCH_SIZE) on the 3rd dimension + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + window_collapsed.set_dimension_step(Window::DimZ, 1); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + Window window_output; + window_output.use_tensor_dimensions(dst->info()->tensor_shape()); + + const Window first_slice_3d = window_collapsed.first_slice_window_3D(); + + Window slice = first_slice_3d; + Window slice_in = first_slice_3d; + Window slice_out = window_output.first_slice_window_2D(); + + 
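+ // Editorial note: the two branches below shape the execution window differently. For NHWC,
+ // dimension 1 is stretched to the full dst extent and dimension 2 iterates over the batch
+ // count recovered from the collapsed window; for NCHW, the X range is rounded up to a
+ // multiple of the vector width, e.g. a convolved width of 25 with 4 elements processed per
+ // iteration gives an X dimension of [0, 28) walked with step 4 (illustrative values).
+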
if(_data_layout == DataLayout::NHWC) + { + const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); + const int num_batches = tmp_win[3].end(); + + slice.set(1, Window::Dimension(0, static_cast<int>(dst->info()->tensor_shape()[1]), 1)); + slice.set(2, Window::Dimension(0, static_cast<int>(num_batches), 1)); + } + else + { + slice.set(0, Window::Dimension(0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); + slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1)); + // Note: In case of NCHW the 3rd dimension is already set collapsing the input window + } + + // Setup input slice + // The dimensions of the input are increased within the OpenCL kernel + slice_in.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_in.set(Window::DimZ, Window::Dimension(0, 0, 0)); + + // Setup output slice + // The dimensions of the output are increased within the OpenCL kernel + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice_in); + if(_num_groups == 1) + { + add_2D_tensor_argument(idx, dst, slice_out); + } + else + { + add_3D_tensor_argument(idx, dst, slice_out); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h new file mode 100644 index 0000000000..a637ad215d --- /dev/null +++ b/src/gpu/cl/kernels/ClIm2ColKernel.h @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_IM2COL_KERNEL_H +#define ARM_COMPUTE_CL_IM2COL_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Size2D.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the im2col reshape kernel. + * + * Rearranges image blocks into columns. It is used to strip out each convolution block to a single column. + * It is used to transform a convolution to a plain matrix multiplication. + * + * For example taking into account the image below and assuming 3x3 image blocks with stride of 1 we have: + * @f[ + * \left( \begin{array}{cccc} + * a00 & a01 & a02 & a03 \\ + * a10 & a11 & a12 & a13 \\ + * a20 & a21 & a22 & a23 \\ + * a30 & a31 & a32 & a33 \\ + * \end{array} \right) + * = + * \left( \begin{array}{ccccccccc} + * a00 & a01 & a02 & a10 & a11 & a12 & a20 & a21 & a22 \\ + * a01 & a02 & a03 & a11 & a12 & a13 & a21 & a22 & a23 \\ + * a10 & a11 & a12 & a20 & a21 & a22 & a30 & a31 & a32 \\ + * a11 & a12 & a13 & a21 & a22 & a23 & a31 & a32 & a33 \\ + * \end{array} \right) + * @f] + */ +class ClIm2ColKernel : public IClKernel +{ +public: + /** Default constructor */ + ClIm2ColKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClIm2ColKernel); + /** Set the input and output of the kernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The input tensor info to convert. 3 lower dimensions represent a single input [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of inputs. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 + * @param[out] dst The output tensor info. First 2 lower dimensions represent a transform of each 3D input, + * while every dimension above represents a batch. Data types supported: Same as @p input + * @param[in] kernel_dims The kernel dimensions (width and height). + * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. + * @param[in] has_bias In case biases are provided expands the matrix with 1. + * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). + * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClIm2ColKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +public: + DataLayout _data_layout; + std::pair<unsigned int, unsigned int> _convolved_dims; + unsigned int _num_elems_processed_per_iteration; + Size2D _kernel_dims; + PadStrideInfo _conv_info; + unsigned int _num_groups; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_IM2COL_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp new file mode 100644 index 0000000000..0bf1932085 --- /dev/null +++ b/src/gpu/cl/kernels/ClMulKernel.cpp @@ -0,0 +1,439 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClMulKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(overflow_policy); + ARM_COMPUTE_UNUSED(rounding_policy); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, + DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, + 1, + DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::S16, DataType::QSYMM16, DataType::F16, + DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), + "Dst can only be U8 if both src are U8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), + "Dst can only be QASYMM8 if both src are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), + "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), + "Dst can only be QSYMM16 if both src are QSYMM16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), + "Dst must be S32 if source tensors are S32"); + if(in_place) + { + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0),
+ "Wrong shape for dst, cannot do in_place calculation");
+ }
+ else
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0),
+ "Wrong shape for dst");
+ }
+ }
+
+ return Status{};
+}
+} // namespace
+
+ClMulKernel::ClMulKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst);
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst,
+ scale, overflow_policy, rounding_policy, act_info));
+
+ auto padding_info = get_padding_info({ src1, src2, dst });
+
+ const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape());
+ auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape));
+
+ int scale_int = -1;
+ // Extract sign, exponent and mantissa
+ int exponent = 0;
+ float normalized_mantissa = std::frexp(scale, &exponent);
+ // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15
+ // frexp returns 0.5 as the mantissa, which means that the exponent will be in the range of -14 <= e <= 1
+ // Moreover, it will be non-positive for n >= 1 as we deal with 1/2^n
+ if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1))
+ {
+ // Store the positive exponent. We know that we compute 1/2^n
+ // Additionally we need to subtract 1 to compensate that frexp used a mantissa of 0.5
+ scale_int = std::abs(exponent - 1);
+ }
+
+ std::string acc_type;
+ // Use a floating-point accumulator if either src is float
+ if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type()))
+ {
+ scale_int = -1;
+ acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half";
+ }
+ else
+ {
+ if(src1->element_size() == 4 || src2->element_size() == 4)
+ {
+ // Use 64-bit accumulator for 32-bit input
+ acc_type = "long";
+ }
+ else if(src1->element_size() == 2 || src2->element_size() == 2)
+ {
+ // Use 32-bit accumulator for 16-bit input
+ acc_type = "int";
+ }
+ else
+ {
+ // Use 16-bit accumulator for 8-bit input
+ acc_type = "ushort";
+ }
+ }
+
+ const bool is_quantized = is_data_type_quantized(src1->data_type());
+ const unsigned int vec_size = adjust_vec_size(16 / dst->element_size(), dst->dimension(0));
+ const unsigned int vec_size_leftover = dst->dimension(0) % vec_size;
+
+ // Set kernel build options
+ std::string kernel_name = "pixelwise_mul";
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type()));
+ build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type()));
+ build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type()));
+ build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size)));
+ build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ?
"1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); + if(is_quantized && (dst->data_type() != DataType::S32)) + { + const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option_if(is_data_type_quantized_asymmetric(src1->data_type()), + "-DOFFSET_IN1=" + support::cpp11::to_string(iq1_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(src2->data_type()), + "-DOFFSET_IN2=" + support::cpp11::to_string(iq2_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dst->data_type()), + "-DOFFSET_OUT=" + support::cpp11::to_string(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + kernel_name += "_quantized"; + } + else + { + kernel_name += (scale_int >= 0) ? "_int" : "_float"; + build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); + build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); + build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); + if(act_info.enabled()) + { + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + } + } + + // Check whether it is in_place calculation + const bool in_place = (src1 == dst) || (src2 == dst); + const bool src1_in_place = in_place && (src1 == dst); + build_opts.add_option_if(in_place, "-DIN_PLACE"); + build_opts.add_option_if(src1_in_place, "-DSRC1_IN_PLACE"); + + // Create kernel + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set scale argument + unsigned int idx = (in_place ? 
2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + + if(scale_int >= 0 && !is_quantized) + { + _kernel.setArg(idx++, scale_int); + } + else + { + _kernel.setArg(idx++, scale); + } + + Window win = calculate_max_window(*dst, Steps(vec_size)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(dst->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); +} + +Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); + + return Status{}; +} + +void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); + + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); + const TensorShape &in_shape2 = src_1->info()->tensor_shape(); + const TensorShape &out_shape = dst->info()->tensor_shape(); + + bool can_collapse = true; + if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + + const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + // Check whether it is in_place calculation + const bool in_place = (src_0 == dst) || (src_1 == dst); + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src_0, slice_input1); + add_3D_tensor_argument(idx, src_1, slice_input2); + if(!in_place) + { + add_3D_tensor_argument(idx, dst, slice); + } + enqueue(queue, *this, slice, lws_hint()); + + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); + } + while(collapsed.slide_window_slice_3D(slice)); +} + +namespace +{ +constexpr unsigned int vec_size_complex = 1; + +Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); + ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); + + // Validate in case of configured dst + if(dst->total_size() > 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + } + + return Status{}; +} +} // namespace + +ClComplexMulKernel::ClComplexMulKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); + + auto padding_info = get_padding_info({ src1, src2, dst }); + + const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); + auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); + if(act_info.enabled()) + { + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + } + + // Create kernel + _kernel = create_kernel(compile_context, "pixelwise_mul_complex", build_opts.options()); + + Window win = calculate_max_window(*dst, Steps(vec_size_complex)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + 
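// Editorial note: validate_arguments_complex() above treats complex tensors as 2-channel
+ // F16/F32 tensors (real and imaginary parts travel together in the channel dimension),
+ // so this validate only needs to forward to it.
+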
ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); + + return Status{}; +} + +void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + const TensorShape &in_shape1 = src_0->info()->tensor_shape(); + const TensorShape &in_shape2 = src_1->info()->tensor_shape(); + const TensorShape &out_shape = dst->info()->tensor_shape(); + + bool can_collapse = true; + if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + { + can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); + for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + { + can_collapse = (in_shape1[d] == in_shape2[d]); + } + } + + bool has_collapsed = false; + Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + + const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; + const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; + + Window slice = collapsed.first_slice_window_3D(); + Window slice_input1 = slice.broadcast_if_dimension_le_one(in_shape1_collapsed); + Window slice_input2 = slice.broadcast_if_dimension_le_one(in_shape2_collapsed); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src_0, slice_input1); + add_3D_tensor_argument(idx, src_1, slice_input2); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); + ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); + } + while(collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h new file mode 100644 index 0000000000..41c862eb03 --- /dev/null +++ b/src/gpu/cl/kernels/ClMulKernel.h @@ -0,0 +1,118 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_MUL_KERNEL_H
+#define ARM_COMPUTE_CL_MUL_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for the pixelwise multiplication kernel.
+ *
+ * For binary elementwise ops, in-place computation cannot be enabled by passing nullptr to dst; it can only be enabled by passing either src1 or src2 as dst instead.
+ *
+ */
+class ClMulKernel : public IClKernel
+{
+public:
+ ClMulKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClMulKernel);
+ /** Initialise the kernel's src and dst.
+ *
+ * Valid configurations (Input1,Input2) -> Output:
+ *
+ * - (U8,U8) -> U8
+ * - (U8,U8) -> S16
+ * - (U8,S16) -> S16
+ * - (S16,U8) -> S16
+ * - (S16,S16) -> S16
+ * - (S32,S32) -> S32
+ * - (F16,F16) -> F16
+ * - (F32,F32) -> F32
+ * - (QASYMM8,QASYMM8) -> QASYMM8
+ * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED
+ * - (QSYMM16,QSYMM16) -> QSYMM16
+ * - (QSYMM16,QSYMM16) -> S32
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[in] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32/S32
+ * @param[in] scale Scale to apply after multiplication.
+ * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15.
+ * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate
+ * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClMulKernel::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale,
+ ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited methods overridden:
+ void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+
+/** Interface for the complex pixelwise multiplication kernel. */
+class ClComplexMulKernel : public IClKernel
+{
+public:
+ ClComplexMulKernel();
+ ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClComplexMulKernel);
+ /** Initialise the kernel's src and dst.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2.
+ * @param[in] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1.
+ * @param[out] dst The dst tensor info.
Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClComplexMulKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_MUL_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp new file mode 100644 index 0000000000..8d4655114b --- /dev/null +++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#include "src/gpu/cl/kernels/ClPermuteKernel.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "src/core/CL/CLValidate.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace
+{
+TensorShape get_dst_shape(const ITensorInfo *src, const PermutationVector &perm)
+{
+ TensorShape dst_shape = src->tensor_shape();
+ permute(dst_shape, perm);
+ return dst_shape;
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() < 1 || src->num_dimensions() > 4,
+ "Permutation of src tensors of up to 4 dimensions is supported");
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4,
+ "Permutation vector size should be between 1 and 4");
+ for(const auto &p : perm)
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation vector has invalid values");
+ }
+
+ // Validate configured dst
+ if(dst->total_size() != 0)
+ {
+ const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+ ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+ }
+ return Status{};
+}
+} // namespace
+
+ClPermuteKernel::ClPermuteKernel()
+{
+ _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm)
+{
+ ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
+ auto padding_info = get_padding_info({ src, dst });
+ const TensorShape dst_shape = get_dst_shape(src, perm);
+ // Output auto initialization if not yet initialized
+ auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape));
+
+ ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, perm));
+
+ _perm = perm;
+
+ // Create kernel
+ CLBuildOptions build_opts;
+ build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type())));
+ build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2)));
+ // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector
+ build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0));
+ build_opts.add_option("-DP2=" + support::cpp11::to_string((_perm.num_dimensions() >= 2) ? perm[1] : 1));
+ build_opts.add_option("-DP3=" + support::cpp11::to_string((_perm.num_dimensions() >= 3) ? perm[2] : 2));
+ build_opts.add_option("-DP4=" + support::cpp11::to_string((_perm.num_dimensions() >= 4) ?
perm[3] : 3)); + + _kernel = create_kernel(compile_context, "permute", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + + ICLKernel::configure_internal(win); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClPermuteKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, perm)); + + return Status{}; +} + +void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); + + // Setup dst slice + Window slice_out(slice_in); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimZ, Window::Dimension(0, 0, 0)); + slice_out.set(3, Window::Dimension(0, 0, 0)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, slice_in); + add_4D_tensor_argument(idx, dst, slice_out); + enqueue(queue, *this, slice_in, lws_hint()); + } + while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h new file mode 100644 index 0000000000..0d349e739b --- /dev/null +++ b/src/gpu/cl/kernels/ClPermuteKernel.h @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_PERMUTE_KERNEL_H +#define ARM_COMPUTE_CL_PERMUTE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to perform tensor permutation. + * + * Permutes given a permutation vector + */ +class ClPermuteKernel : public IClKernel +{ +public: + ClPermuteKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPermuteKernel); + /** Set the src and dst of the kernel. + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src + * @param[in] perm Permutation vector + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClPermuteKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + PermutationVector _perm{}; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_PERMUTE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp new file mode 100644 index 0000000000..04f2b142bd --- /dev/null +++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp @@ -0,0 +1,509 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClPool2dKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +// Internal window config info +using ClPoolingConfig = std::pair<unsigned int, BorderSize>; //num_elems_processed_per_iteration, border_size + +void auto_init(const ITensorInfo *src, ITensorInfo *dst, ITensorInfo *indices, PoolingLayerInfo pool_info) +{ + TensorShape out_shape = compute_pool_shape(*src, pool_info); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); + if(indices) + { + auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); + } +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), + "Unsupported combination of parameters!"); + + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_global_pooling = pool_info.is_global_pooling; + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + int output_width = 0; + int output_height = 0; + std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + pool_size_x, pool_size_y, pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + + // Check indices + if(indices) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + + if(indices->total_size() != 0) + { + TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); + } + } + + // Checks performed when dst is configured + if(dst->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); + TensorInfo out_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, dst->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &out_info); + } + + return Status{}; +} + +std::tuple<Status, Window, ClPoolingConfig> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Get data layout + const DataLayout data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + int pool_stride_x = 0; + int pool_stride_y = 0; + unsigned int pooled_w = 0; + unsigned int pooled_h = 0; + int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + int pool_size_y = pool_info.is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_right = pad_stride_info.pad_right(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const int pool_pad_bottom = pad_stride_info.pad_bottom(); + BorderSize border_size = BorderSize(); + + auto_init(src, dst, indices, pool_info); + pooled_w = dst->tensor_shape()[idx_width]; + pooled_h = dst->tensor_shape()[idx_height]; + + const DataType data_type = src->data_type(); + + const int src_width = src->dimension(idx_width); + const int src_height = src->dimension(idx_height); + + unsigned int num_elems_processed_per_iteration = 0; + bool window_changed = false; + Window win{}; + switch(data_layout) + { + case DataLayout::NCHW: + { + // Initialize border size + border_size = BorderSize(pool_pad_top, pool_pad_right, pool_pad_bottom, pool_pad_left); + // Change the number of elements processed per iteration + // for 3x3 pooling with a stride less than or equal to 3 + const bool can_optimize = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3) && !is_data_type_quantized(data_type); + num_elems_processed_per_iteration = can_optimize ? 4 : 1; + const unsigned int num_elems_read_per_iteration = (num_elems_processed_per_iteration - 1) * pool_stride_x + pool_size_x; + + // Number of iterations in X dimension + const int num_iterations_x = (pooled_w + num_elems_processed_per_iteration - 1) / num_elems_processed_per_iteration; + + // Upper limit for the number of right/bottom border elements that are accessed + const int upper_bound_w = ((num_iterations_x - 1) * num_elems_processed_per_iteration * pool_stride_x - pool_pad_left + num_elems_read_per_iteration) - src_width; + const int upper_bound_h = ((pooled_h - 1) * pool_stride_y - pool_pad_top + pool_size_y) - src_height; + + border_size.right = std::max(upper_bound_w, pool_pad_right); + border_size.bottom = std::max(upper_bound_h, pool_pad_bottom); + + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + + AccessWindowRectangle src_access(src, -pool_pad_left, -pool_pad_top, num_elems_read_per_iteration, pool_size_y, + pool_stride_x, pool_stride_y); + AccessWindowHorizontal dst_access(dst, 0, num_elems_processed_per_iteration); + + // Update indices window + if(indices) + { + AccessWindowHorizontal indices_access(indices, 0, num_elems_processed_per_iteration); + window_changed = update_window_and_padding(win, src_access, dst_access, indices_access); + indices_access.set_valid_region(win, ValidRegion(Coordinates(), indices->tensor_shape())); + } + else + { + window_changed = update_window_and_padding(win, src_access, dst_access); + } + + dst_access.set_valid_region(win, ValidRegion(Coordinates(), dst->tensor_shape())); + break; + } + case DataLayout::NHWC: + { + const size_t vec_size = dst->data_type() == DataType::F32 ? 2 : 4; + + // Initialize border size + border_size = BorderSize(); + num_elems_processed_per_iteration = adjust_vec_size(vec_size, dst->dimension(0)); + win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_tuple(err, win, ClPoolingConfig(num_elems_processed_per_iteration, border_size)); +} +} // namespace + +ClPool2dKernel::ClPool2dKernel() +{ + _type = CLKernelType::POOL; +} + +BorderSize ClPool2dKernel::border_size() const +{ + return _border_size; +} + +void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + auto padding_info = get_padding_info({ src, dst, indices }); + + // Set instance variables + _pool_info = pool_info; + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + int pool_stride_x = 0; + int pool_stride_y = 0; + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool exclude_padding = pool_info.exclude_padding; + std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + + // Set build options + CLBuildOptions build_opts; + const DataType data_type = src->data_type(); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst, pool_info, indices); + + ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); + ICLKernel::configure_internal(std::get<1>(win_config)); + + ClPoolingConfig pooling_config = std::get<2>(win_config); + _num_elems_processed_per_iteration = pooling_config.first; + _border_size = pooling_config.second; + + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration)); + + // Tensor paddings are used to calculate the indices for MAX pooling + if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + { + build_opts.add_option("-DPAD_TENSOR_LEFT=" + support::cpp11::to_string(src->padding().left)); + build_opts.add_option("-DPAD_TENSOR_RIGHT=" + support::cpp11::to_string(src->padding().right)); + build_opts.add_option("-DPAD_TENSOR_TOP=" + support::cpp11::to_string(src->padding().top)); + build_opts.add_option("-DPAD_TENSOR_BOTTOM=" + support::cpp11::to_string(src->padding().bottom)); + build_opts.add_option("-DTENSOR_CHANNEL=" + support::cpp11::to_string(src->dimension(idx_channel))); + build_opts.add_option("-DTENSOR_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); + build_opts.add_option("-DTENSOR_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); + } + + if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); + const UniformQuantizationInfo 
oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq_info.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq_info.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Check dst dimensions + auto_init(src, dst, indices, pool_info); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); + + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); + build_opts.add_option("-DPOOL_" + string_from_pooling_type(pool_type)); + build_opts.add_option("-DSTRIDE_X=" + support::cpp11::to_string(pool_stride_x)); + build_opts.add_option("-DSTRIDE_Y=" + support::cpp11::to_string(pool_stride_y)); + build_opts.add_option("-DPAD_X=" + support::cpp11::to_string(pool_pad_left)); + build_opts.add_option("-DPAD_Y=" + support::cpp11::to_string(pool_pad_top)); + build_opts.add_option("-DPOOL_SIZE_X=" + support::cpp11::to_string(pool_size_x)); + build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); + + // Set the initial value for the pooling operation according to the data type + if(pool_type == PoolingType::MAX) + { + if(is_data_type_quantized(data_type)) + { + PixelValue type_min{}; + std::tie(type_min, std::ignore) = get_min_max(data_type); + build_opts.add_option("-DINITIAL_VALUE=" + support::cpp11::to_string(type_min.get<int32_t>())); + } + else + { + build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); + } + } + else + { + // Pool AVG and Pool L2 initial value + build_opts.add_option("-DINITIAL_VALUE=0"); + } + + build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + + // Create kernel + switch(_data_layout) + { + case DataLayout::NCHW: + { + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; + const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); + const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : data_type); + build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); + build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); + + if(pool_type != PoolingType::MAX) + { + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + } + + if((pool_size_x == 3) && (pool_size_y == 3) && !is_data_type_quantized_asymmetric(data_type)) + { + // Check if we have pool3x3 with stride_x less than or equal to 3. In these cases, run an optimized OpenCL kernel where + // each thread computes 4 dst elements + const bool is_pool3x3_stride_le3 = (pool_size_x == 3) && (pool_size_y == 3) && (pool_stride_x <= 3); + + std::string kernel_name = ((is_pool3x3_stride_le3) ? 
"pooling_layer_optimized_" : "pooling_layer_") + + support::cpp11::to_string(pool_size_x); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + { + // For max pooling with pool2x2, store indicies which will be used in max unpooling + if(data_type == DataType::F32) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp32"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else if(data_type == DataType::F16) + { + std::string kernel_name = "pooling_layer_2_nchw_indices_fp16"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + } + else // Run general case + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nchw" : "pooling_layer_MxN_nchw"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + break; + } + case DataLayout::NHWC: + { + // Floating point mixed precision is support on F16 only + const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + + // Wider accumulation is required to avoid accuracy loss + // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) + // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) + DataType acc_data_type = data_type; + + if(use_fp_mixed_precision) + { + acc_data_type = DataType::F32; + } + else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + { + acc_data_type = DataType::S32; + } + + build_opts.add_option("-DACC_DATA_TYPE=" + get_cl_type_from_data_type(acc_data_type)); + build_opts.add_option_if(use_fp_mixed_precision, "-DFP_MIXED_PRECISION"); + build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); + build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); + build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) + { + build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); + + std::string kernel_name = "pooling_layer_2x2_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + else + { + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? 
"pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + } + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } + + // Set config_id for enabling LWS tuning + _config_id = "pooling_layer_"; + _config_id += lower_string(string_from_data_type(data_type)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_width)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_height)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(idx_channel)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(src->data_layout())); + + ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); +} + +Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); + ARM_COMPUTE_RETURN_ON_ERROR(std::get<0>(validate_and_configure_window(src->clone().get(), dst->clone().get(), pool_info))); + + return Status{}; +} + +void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + unsigned int pool_stride_x = 0; + unsigned int pool_stride_y = 0; + std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); + auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1)); + + // Collapse window + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + Window slice = window_collapsed.first_slice_window_3D(); + do + { + // Upsample src by pool size + Window in_slice(slice); + in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start() - _pool_info.pad_stride_info.pad_left(), + (in_slice.x().end() - _pool_info.pad_stride_info.pad_left()) * pool_stride_x, + pool_stride_x * _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, Window::Dimension(in_slice.y().start() - _pool_info.pad_stride_info.pad_top(), + (in_slice.y().end() - _pool_info.pad_stride_info.pad_top()) * pool_stride_y, + pool_stride_y)); + + // Set srcs + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, in_slice); + add_3D_tensor_argument(idx, dst, slice); + if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_3D_tensor_argument(idx, indices, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); + break; + } + case DataLayout::NHWC: + { + const size_t batch_size = dst->info()->tensor_shape().total_size_upper(3); + + Window slice = window_collapsed.first_slice_window_4D(); + Window in_slice = window_collapsed.first_slice_window_4D(); + in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); + in_slice.set(Window::DimY, 
Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); + in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); + in_slice.set(3, Window::Dimension(0, batch_size, 1)); + do + { + // Set srcs + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, in_slice); + add_4D_tensor_argument(idx, dst, slice); + if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + { + add_4D_tensor_argument(idx, indices, slice); + } + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); + break; + } + default: + ARM_COMPUTE_ERROR("Not implemented"); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h new file mode 100644 index 0000000000..61d204dc68 --- /dev/null +++ b/src/gpu/cl/kernels/ClPool2dKernel.h @@ -0,0 +1,75 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_POOL2D_KERNEL_H +#define ARM_COMPUTE_CL_POOL2D_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the pooling layer kernel */ +class ClPool2dKernel : public IClKernel +{ +public: + ClPool2dKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClPool2dKernel); + + /** Configure kernel for a given list of arguments + * + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. + * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. 
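 + *
 + * A minimal usage sketch (illustrative only; src_info, dst_info and compile_context
 + * are assumed to exist and are not part of this patch):
 + * @code
 + * PoolingLayerInfo pool_info(PoolingType::MAX, Size2D(2, 2), DataLayout::NHWC, PadStrideInfo(2, 2, 0, 0));
 + * ARM_COMPUTE_ERROR_THROW_ON(ClPool2dKernel::validate(&src_info, &dst_info, pool_info));
 + * ClPool2dKernel kernel;
 + * kernel.configure(compile_context, &src_info, &dst_info, pool_info);
 + * @endcode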
+ */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool2dKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +public: + PoolingLayerInfo _pool_info{}; + DataLayout _data_layout{ DataLayout::UNKNOWN }; + BorderSize _border_size{ 0 }; + unsigned int _num_elems_processed_per_iteration{ 1 }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL2D_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp new file mode 100644 index 0000000000..de222a99b2 --- /dev/null +++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp @@ -0,0 +1,180 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" + +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" + +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + + // Output must always be initialized + ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); + + return Status{}; +} +} // namespace + +ClQuantizeKernel::ClQuantizeKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + auto padding_info = get_padding_info({ src, dst }); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + const int vec_size_x = 16 / src->element_size(); + const int input_width_x = src->tensor_shape().x(); + const bool multi_access_x = (input_width_x / vec_size_x > 0); + + const UniformQuantizationInfo qinfo = dst->quantization_info().uniform(); + const DataType output_data_type = dst->data_type(); + + float scale_to_apply = qinfo.scale; + int32_t offset_to_apply = qinfo.offset; + if(is_data_type_quantized_asymmetric(src->data_type())) + { + /* + * In case of requantization of a quantized input tensor to an output tensor with another quantization + * instead of of apply dequantization and then a quantization functions, we just compute new scale and + * offset to apply. 
+ * + * Assuming: + * - q_i as input quantized value + * - q_o as output quantized value + * - z_i as input quantization offset value + * - z_o as output quantization offset value + * - s_i as input quantization scale value + * - s_o as output quantization scale value + * - z_n as new quantization offset value + * - s_n as new quantization scale value + * + * q_o = ( q_i - z_i ) * s_i / s_o + z_o + * + * We can rewrite the formula as: + * + * q_o = ( q_i * s_i / s_o ) - z_i * s_i / s_o + z_o + * + * q_o = q_i / s_n + z_n + * + * Where: + * + * s_n = s_o / s_i + * + * z_n = - z_i * s_i / s_o + z_o + * + */ + const UniformQuantizationInfo qinfo_in = src->quantization_info().uniform(); + scale_to_apply /= qinfo_in.scale; + // In order to minimize flooring we convert the offset to a float, + // then compute the new offset in the float domain, + // finally we convert it back to int32_t + offset_to_apply -= static_cast<int32_t>(static_cast<float>(qinfo_in.offset) * qinfo_in.scale / qinfo.scale); + } + + // Create kernel + CLBuildOptions build_opts; + build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOAT"); + build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(scale_to_apply)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(offset_to_apply)); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); + build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); + build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); + + _kernel = create_kernel(compile_context, "quantization_layer", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps()); + if(multi_access_x) + { + win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + } + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClQuantizeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + return Status{}; +} + +void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3); + Window slice = window_collapsed.first_slice_window_3D(); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git 
a/src/gpu/cl/kernels/ClQuantizeKernel.h b/src/gpu/cl/kernels/ClQuantizeKernel.h new file mode 100644 index 0000000000..aeab28febe --- /dev/null +++ b/src/gpu/cl/kernels/ClQuantizeKernel.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_QUANTIZE_KERNEL_H +#define ARM_COMPUTE_CL_QUANTIZE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the quantization layer kernel. + * + * @note The implementation supports only 3D input tensors. + */ +class ClQuantizeKernel : public IClKernel +{ +public: + ClQuantizeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClQuantizeKernel); + /** Set the input, output. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F32/F16. + * @param[out] dst Destination tensor info with the same dimensions of input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16. + * + * @note Output auto initialization is not supported by this kernel + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClQuantizeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_QUANTIZE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp new file mode 100644 index 0000000000..246bd9c838 --- /dev/null +++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp @@ -0,0 +1,134 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClReshapeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include <string> + +/** [ClReshapeKernel Kernel] **/ +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + if(dst->tensor_shape().total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().total_size() != dst->tensor_shape().total_size()); + } + + return Status{}; +} +} // namespace + +ClReshapeKernel::ClReshapeKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClReshapeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + // Create kernel + std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; + _kernel = create_kernel(compile_context, "reshape_layer", build_opts); + + // Add static arguments + const cl_int2 src_shape = + { + { + static_cast<cl_int>(src->tensor_shape()[0]), + static_cast<cl_int>(src->tensor_shape()[1]) + } + }; + const cl_int2 dst_shape = + { + { + static_cast<cl_int>(dst->tensor_shape()[0]), + static_cast<cl_int>(dst->tensor_shape()[1]) + } + }; + unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters + _kernel.setArg<cl_int2>(idx++, src_shape); + _kernel.setArg<cl_int2>(idx++, dst_shape); + + // Configure kernel window + Window win = calculate_max_window(*src); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status 
ClReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst)); + + return Status{}; +} + +void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_3D(); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + // Set srcs + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, window_collapsed); + add_3D_tensor_argument(idx, dst, window_collapsed); + enqueue(queue, *this, slice, lws_hint()); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +/** [ClReshapeKernel Kernel] **/ diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h new file mode 100644 index 0000000000..db6ab5da58 --- /dev/null +++ b/src/gpu/cl/kernels/ClReshapeKernel.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_RESHAPE_KERNEL_H +#define ARM_COMPUTE_CL_RESHAPE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the kernel to perform tensor reshaping */ +class ClReshapeKernel : public IClKernel +{ +public: + ClReshapeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClReshapeKernel); + /** Set the src and dst of the kernel + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data type supported: All. + * @param[out] dst Destination tensor info. 
Data type supported: Same as @p src + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClReshapeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp new file mode 100644 index 0000000000..9307f7d4fb --- /dev/null +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -0,0 +1,213 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClScaleKernel.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/TensorInfo.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/ScaleUtils.h" +#include "support/Cast.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +inline std::pair<float, float> calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) +{ + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + + // Compute the ratio between source width/height and destination width/height + const unsigned int src_width = src->dimension(idx_width); + const unsigned int src_height = src->dimension(idx_height); + const unsigned int dst_width = dst->dimension(idx_width); + const unsigned int dst_height = dst->dimension(idx_height); + + float scale_x = arm_compute::scale_utils::calculate_resize_ratio(src_width, dst_width, align_corners); + float scale_y = arm_compute::scale_utils::calculate_resize_ratio(src_height, dst_height, align_corners); + + return std::make_pair(scale_x, scale_y); +} + +Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(dst == src); + ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); + + float scale_x = 0.f; + float scale_y = 0.f; + const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); + + ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); + + return Status{}; +} +} // namespace + +Status ClScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, info)); + return Status{}; +} + +ClScaleKernel::ClScaleKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); + auto padding_info = get_padding_info({ src, dst }); + + // Info required for the static tuning + _data_layout = info.data_layout == DataLayout::UNKNOWN ? 
src->data_layout() : info.data_layout; + + const bool is_nhwc = _data_layout == DataLayout::NHWC; + + float scale_x = 0.f; + float scale_y = 0.f; + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); + const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; + + // Area interpolation behaves as Nearest Neighbour in case of up-sampling + auto interpolation_policy_to_use = info.interpolation_policy; + if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) + { + interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; + } + + // Create kernel + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const unsigned int src_width = src->dimension(idx_width); + const unsigned int src_height = src->dimension(idx_height); + const unsigned int dst_width = dst->dimension(idx_width); + const unsigned int vec_size = adjust_vec_size(is_nhwc ? 1 : 4, dst_width); + const unsigned int vec_size_leftover = (dst_width % vec_size); + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); + build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); + build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); + + build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); + build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); + build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); + build_opts.add_option_if(!is_nhwc, "-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? 
support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); + build_opts.add_option_if(is_nhwc, "-DDEPTH_OUT=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); + if(is_qasymm_bilinear) + { + const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); + build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); + } + std::string interpolation_name = string_from_interpolation_policy(interpolation_policy_to_use); + std::transform(interpolation_name.begin(), interpolation_name.end(), interpolation_name.begin(), ::tolower); + std::string kernel_name = "scale_" + interpolation_name + "_"; + kernel_name += lower_string(string_from_data_layout(_data_layout)); + + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps(vec_size)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = "scale_"; + _config_id += (info.border_mode == BorderMode::REPLICATE ? "Bord_rep" : ""); + _config_id += (info.sampling_policy == SamplingPolicy::CENTER ? "center" : "topleft"); + _config_id += (is_nhwc ? "nhwc" : "nchw"); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(3)); +} + +void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + switch(_data_layout) + { + case DataLayout::NCHW: + { + Window slice = window.first_slice_window_2D(); + + do + { + unsigned int idx = 0; + add_2D_tensor_argument(idx, src, slice); + add_2D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_2D(slice)); + break; + } + case DataLayout::NHWC: + { + Window collapsed = window.collapse(ICLKernel::window(), Window::DimZ); + Window slice = collapsed.first_slice_window_4D(); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, slice); + add_4D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + break; + } + default: + ARM_COMPUTE_ERROR("Data layout not supported"); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h new file mode 100644 index 0000000000..dd09e92ee2 --- /dev/null +++ b/src/gpu/cl/kernels/ClScaleKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_SCALE_KERNEL_H +#define ARM_COMPUTE_CL_SCALE_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the scale kernel */ +class ClScaleKernel : public IClKernel +{ +public: + ClScaleKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClScaleKernel); + /** Initialise the kernel's inputs, output and interpolation policy + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32 + * @param[out] dst Destination tensor info. Data types supported: Same as @p src + * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. + * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClScaleKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + DataLayout _data_layout{ DataLayout::UNKNOWN }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SCALE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp new file mode 100644 index 0000000000..4c00413469 --- /dev/null +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp @@ -0,0 +1,365 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClSoftmaxKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +/** Calculates softmax parameters from the quantized input scale and scaling factor for the exponent and places them as build options. + * + * Prepares these build options: + * -INPUT_BETA_MULTIPLIER, INPUT_BETA_LEFT_SHIFT - quantized representation of beta multiplier. + * -DIFF_MIN - threshold difference between maximum value of input data and current processed value, + * it defines whether the value will be taken into account or not. 
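 + *
 + * Worked example (illustrative numbers, not taken from the sources): for
 + * input_scale = 1/256 and beta = 1, beta_multiplier = (1/256) * 2^(31 - 5) = 2^18,
 + * which calculate_quantized_multiplier_greater_than_one() represents as
 + * INPUT_BETA_MULTIPLIER = 2^30 (i.e. 0.5 in Q31) with INPUT_BETA_LEFT_SHIFT = 19,
 + * and DIFF_MIN = -floor(31 * 2^26 / 2^19) = -3968.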
+ * + * @param[in] input_scale Input scaling factor + * @param[in] beta Exponent scaling factor beta + * + * @return The prepared build options + */ +CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float beta) +{ + // Number of integer bits in temporary fixed-point representation of current-to-max difference + static const int scaled_diff_int_bits = 5; + // Number of integer bits used in temporary fixed-point representation of exponent accumulator + static const int exp_accumulation_in_bits = 12; + + const double beta_multiplier = std::min( + 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), + (1LL << 31) - 1.0); + int input_beta_multiplier; + int input_beta_left_shift; + quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); + + const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); + const int diff_min = -1.f * std::floor(max_input_rescaled); + + CLBuildOptions build_opts; + build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); + build_opts.add_option("-DEXP_ACCUMULATION_INT_BITS=" + support::cpp11::to_string(exp_accumulation_in_bits)); + build_opts.add_option("-DINPUT_BETA_MULTIPLIER=" + support::cpp11::to_string(input_beta_multiplier)); + build_opts.add_option("-DINPUT_BETA_LEFT_SHIFT=" + support::cpp11::to_string(input_beta_left_shift)); + build_opts.add_option("-DDIFF_MIN=" + support::cpp11::to_string(diff_min)); + + return build_opts; +} + +Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); + + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); + + // Checks performed when output is configured + if(dst.total_size() != 0) + { + if(is_quantized_asymmetric) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); + } + + // Checks performed when sum is configured + if(sum.total_size() != 0) + { + if(is_quantized_asymmetric) + { + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); + } + else + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&max, &sum); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&max, &sum); + } + + return Status{}; +} + +Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &sum); + ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type)); + + // Note: output should always have a scale of 1/256 and offset 0 + const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const bool is_quantized_asymmetric = 
+
+    // Checks performed when output is configured
+    if(dst.total_size() != 0)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+        if(!is_quantized_asymmetric)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+            ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() != allowed_quantization_info);
+        }
+    }
+
+    return Status{};
+}
+} // namespace
+
+/** Grid size (obtained through auto-tuning) */
+const unsigned int ClLogits1DMaxShiftExpSumKernel::_grid_size = 64;
+/** Vector size in the serial case (obtained through auto-tuning) */
+const unsigned int ClLogits1DMaxShiftExpSumKernel::_serial_vector_size = 8;
+/** Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
+const unsigned int ClLogits1DMaxShiftExpSumKernel::_parallel_vector_size = 4;
+
+ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel()
+{
+    _type = CLKernelType::ELEMENTWISE;
+}
+
+void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info)
+{
+    auto padding_info = get_padding_info({ &src, &max, &dst, &sum });
+
+    // Output auto initialization if not yet initialized
+    auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape()));
+    auto_init_if_empty(dst, *src.clone());
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
+
+    const DataType                dt                 = src.data_type();
+    const UniformQuantizationInfo qinfo              = src.quantization_info().uniform();
+    const size_t                  reduction_dim_size = src.dimension(0);
+    const float                   beta               = info.beta;
+    const auto                    is_signed_qasymm8  = is_data_type_quantized_asymmetric_signed(info.input_data_type);
+    const int                     min_value          = is_signed_qasymm8 ? CL_SCHAR_MIN : 0;
+
+    ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(reduction_dim_size);
+    const unsigned int    vector_size             = adjust_vec_size(std::get<1>(parallel_reduction_info), reduction_dim_size);
+
+    // Set build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt));
+    build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value));
+    build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size));
+    build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(reduction_dim_size));
+    build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(reduction_dim_size % vector_size));
+    build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size))));
+    build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE");
+    build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED");
+    build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + float_to_string_with_full_precision(beta));
+    build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX");
+    build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX")));
+    build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options());
+
+    cl::NDRange lws_hint(cl::NullRange);
+    std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "");
+
+    // Configure parallel kernel if needed
+    if(std::get<0>(parallel_reduction_info))
+    {
+        kernel_name += "parallel";
+        bool is_grid_size_pow2 = (_grid_size != 0) && ((_grid_size & (_grid_size - 1)) == 0);
+        build_opts.add_option_if(is_grid_size_pow2 && _grid_size <= 256, "-DGRID_SIZE=" + support::cpp11::to_string(_grid_size));
+
+        // Handle boundary conditions.
+        const unsigned int multiple_grid_size = (reduction_dim_size / vector_size) % _grid_size;
+        build_opts.add_option_if((multiple_grid_size != 0) || ((reduction_dim_size % vector_size) != 0), "-DNON_MULTIPLE_OF_GRID_SIZE");
+        // Setting _lws_hint in this way can also communicate grid_size to ClLogits1DMaxShiftExpSumKernel::run_op().
+        // A single workgroup performs the reduction in dimension 0 in the parallel case, hence lws[0]==gws[0].
+        lws_hint = cl::NDRange(_grid_size);
+    }
+    else
+    {
+        kernel_name += "serial";
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure window
+    Window win = calculate_max_window(src, Steps(reduction_dim_size));
+    IClKernel::configure_internal(win, lws_hint);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum));
+    return Status{};
+}
+
+ClLogits1DMaxShiftExpSumKernel::ParallelReductionInfo ClLogits1DMaxShiftExpSumKernel::is_parallel_reduction(size_t size)
+{
+    bool         is_parallel_reduction = (size >= (_grid_size * _serial_vector_size)) && (_grid_size > 1);
+    unsigned int vector_size           = is_parallel_reduction ? 
_parallel_vector_size : _serial_vector_size; + return std::make_tuple(is_parallel_reduction, vector_size); +} + +void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + auto max = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0)); + auto sum = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_1)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, max, sum); + + // Collapse window in Z dimension + Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); + + // Reconfigure window in case of parallel reduction + ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); + if(std::get<0>(parallel_reduction_info)) + { + // Launch grid_size parallel work items + window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); + } + + // Get slices + Window slice = window_collapsed.first_slice_window_3D(); + do + { + unsigned int idx = 0; + // Set inputs + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, max, slice); + add_3D_tensor_argument(idx, dst, slice); + add_3D_tensor_argument(idx, sum, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} + +ClLogits1DNormKernel::ClLogits1DNormKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) +{ + auto padding_info = get_padding_info({ &src, &dst, &sum }); + + // Note: output should always have a scale of 1/256 and offset 0 + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const DataType output_data_type = info.input_data_type; + const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); + + // Output auto initialization if not yet initialized + auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); + + const auto is_signed_qasymm8 = is_data_type_quantized_asymmetric_signed(info.input_data_type); + const int min_value = is_signed_qasymm8 ? 
CL_SCHAR_MIN : 0; + const unsigned int vector_size = adjust_vec_size(16, src.dimension(0)); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(info.input_data_type)); + build_opts.add_option("-DMIN_VALUE=" + support::cpp11::to_string(min_value)); + build_opts.add_option("-DVECTOR_SIZE=" + support::cpp11::to_string(vector_size)); + build_opts.add_option("-DVECTOR_SIZE_LEFTOVER=" + support::cpp11::to_string(src.dimension(0) % vector_size)); + build_opts.add_option_if(is_data_type_quantized_asymmetric_signed(info.input_data_type), "-DQASYMM8_SIGNED"); + build_opts.add_options_if(is_quantized_asymmetric, + prepare_quantized_softmax_build_options(qinfo.scale, info.beta).options()); + build_opts.add_option_if(info.is_log, "-DLOG_SOFTMAX"); + + // Create kernel + std::string kernel_name = std::string("softmax_layer_norm") + (is_quantized_asymmetric ? "_quantized" : ""); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure window + auto win = calculate_max_window(src, Steps(vector_size)); + ICLKernel::configure_internal(win); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); + + return Status{}; +} + +void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + auto sum = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0)); + + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst, sum); + + Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); + Window slice = window_collapsed.first_slice_window_3D(); + + do + { + Window sum_slice = slice; + sum_slice.set(Window::DimX, Window::Dimension(0, 1, 1)); + + unsigned int idx = 0; + // Set inputs + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, sum, sum_slice); + add_3D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
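For orientation: together, ClLogits1DMaxShiftExpSumKernel and ClLogits1DNormKernel above implement the numerically stable two-pass softmax. A minimal scalar sketch of the float path, assuming a single row of logits (the function and variable names below are illustrative, not part of this patch):

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <vector>

// Scalar equivalent of the two-kernel pipeline, for one row of float logits.
std::vector<float> softmax_1d(const std::vector<float> &logits, float beta = 1.0f)
{
    if(logits.empty())
    {
        return {};
    }
    // Pass 1 (max-shift-exp-sum): subtracting the row maximum before
    // exponentiating keeps exp() from overflowing for large logits.
    const float        max_val = *std::max_element(logits.begin(), logits.end());
    std::vector<float> out(logits.size());
    float              sum = 0.f;
    for(std::size_t i = 0; i < logits.size(); ++i)
    {
        out[i] = std::exp(beta * (logits[i] - max_val));
        sum += out[i];
    }
    // Pass 2 (norm): divide every exponential by the accumulated sum.
    for(float &v : out)
    {
        v /= sum;
    }
    return out;
}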
\ No newline at end of file
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h
new file mode 100644
index 0000000000..a221e12132
--- /dev/null
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h
@@ -0,0 +1,118 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SOFTMAX_KERNEL_H
+#define ARM_COMPUTE_CL_SOFTMAX_KERNEL_H
+
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** Interface for max, shifting, exponentiating and summing the logits */
+class ClLogits1DMaxShiftExpSumKernel : public IClKernel
+{
+    /** Grid size (obtained through auto-tuning) */
+    static const unsigned int _grid_size;
+    /** Vector size in the serial case (obtained through auto-tuning) */
+    static const unsigned int _serial_vector_size;
+    /** Vector size in the parallel case (obtained through auto-tuning, enables the best memory access pattern for Bifrost). */
+    static const unsigned int _parallel_vector_size;
+
+public:
+    /** Info for whether a parallel reduction will be run and the vector size of the execution. */
+    using ParallelReductionInfo = std::tuple<bool, unsigned int>;
+
+    ClLogits1DMaxShiftExpSumKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DMaxShiftExpSumKernel);
+    /** Configure the kernel using the given information about tensors
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in]     src             Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32
+     * @param[in,out] max             Max values tensor. Data types supported: same as @p src
+     * @param[out]    dst             Destination tensor. Data types supported: same as @p src
+     * @param[out]    sum             Sum of 1D logits tensor. Data types supported: same as @p src
+     * @param[in]     info            Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo.
+ */ + void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); + /** Checks if the given size is eligible for parallel reduction + * + * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). + * @note Parallel reduction is launched for width >= (_grid_size * _serial_vector_size) and vector_size is forced to 4. + * + * @param[in] size Size to check + * + * @return A two-element tuple where the first element is a boolean specifying if a parallel reduction will be run, + * while the second element is the vector size of the execution. + */ + static ParallelReductionInfo is_parallel_reduction(size_t size); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; + +/** Interface for calculating the final step of the Softmax Layer where each logit value is multiplied by the inverse of the sum of the logits. */ +class ClLogits1DNormKernel : public IClKernel +{ +public: + ClLogits1DNormKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClLogits1DNormKernel); + + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data types supported: S32/F16/F32. If this kernel is used for log softmax, only F32/F16 is supported. + * @param[in] sum Sum tensor. Dimensions should be dim(input)-1. Data types supported: same as @p input + * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input + * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClLogits1DNormKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SOFTMAX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp new file mode 100644 index 0000000000..bdce2a2f5c --- /dev/null +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -0,0 +1,124 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClTransposeKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +ClTransposeKernel::ClTransposeKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClTransposeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output auto initialization if not yet initialized + const TensorShape dst_shape = misc::shape_calculator::compute_transposed_shape(*src); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); + + ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); + auto padding_info = get_padding_info({ src, dst }); + + // Create kernel + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; + const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); + const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; + + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE_IN_BYTES=" + support::cpp11::to_string(src->element_size())); + build_opts.add_option("-DVEC_SIZE_X=" + support::cpp11::to_string(vec_size_x)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER_X=" + support::cpp11::to_string(vec_size_x_leftovers)); + build_opts.add_option("-DVEC_SIZE_Y=" + support::cpp11::to_string(vec_size_y)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER_Y=" + support::cpp11::to_string(vec_size_y_leftovers)); + + _kernel = create_kernel(compile_context, "transpose", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*src, Steps(vec_size_x, vec_size_y)); + ICLKernel::configure_internal(win, cl::NDRange(2, 8)); + 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst);
+    ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src);
+    ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN);
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Only up to 2-D src tensors are supported");
+
+    // Validate configured dst
+    if(dst->total_size() != 0)
+    {
+        const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src));
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info);
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst);
+        ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst);
+    }
+
+    return Status{};
+}
+
+void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window slice = window.first_slice_window_2D();
+
+    do
+    {
+        unsigned int idx = 0;
+        add_2D_tensor_argument(idx, src, slice);
+        add_2D_tensor_argument(idx, dst, slice);
+        enqueue(queue, *this, slice, lws_hint());
+    }
+    while(window.slide_window_slice_2D(slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
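The kernel above follows the stateless-operator pattern used throughout these new gpu/cl kernels: the caller owns the tensors and hands them in at execution time through an ITensorPack. A hedged usage sketch; compile_context, queue, src_info/dst_info and src_tensor/dst_tensor are assumed to be pre-existing, valid objects (all placeholder names, not part of this patch):

// Illustrative driver only; object creation and error handling elided.
using namespace arm_compute;

opencl::kernels::ClTransposeKernel transpose;
ARM_COMPUTE_ERROR_THROW_ON(opencl::kernels::ClTransposeKernel::validate(src_info, dst_info));
transpose.configure(compile_context, src_info, dst_info); // src_info/dst_info: ITensorInfo *

ITensorPack pack;
pack.add_const_tensor(TensorType::ACL_SRC, src_tensor); // src_tensor: const ICLTensor *
pack.add_tensor(TensorType::ACL_DST, dst_tensor);       // dst_tensor: ICLTensor *
transpose.run_op(pack, transpose.window(), queue);      // queue: cl::CommandQueue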
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClTransposeKernel.h b/src/gpu/cl/kernels/ClTransposeKernel.h new file mode 100644 index 0000000000..b30d6f0281 --- /dev/null +++ b/src/gpu/cl/kernels/ClTransposeKernel.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H +#define ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to transpose a 2D tensor. */ +class ClTransposeKernel : public IClKernel +{ +public: + ClTransposeKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClTransposeKernel); + /** Set the src and dst of the kernel. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClTransposeKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_TRANSPOSE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp new file mode 100644 index 0000000000..8f36345076 --- /dev/null +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp @@ -0,0 +1,164 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +using namespace misc::shape_calculator; +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON(num_groups == 0); + ARM_COMPUTE_RETURN_ERROR_ON(input->data_layout() == DataLayout::NHWC && num_groups > 1); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); + ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); + + if(biases != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); + } + + // Checks performed when output is configured + if(output->total_size() != 0) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); + } + + return Status{}; +} +} // namespace + +ClWeightsReshapeKernel::ClWeightsReshapeKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, 
dst);
+
+    // Output tensor auto initialization if not yet initialized
+    auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups)));
+
+    // Perform validation step
+    ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups));
+    auto padding_info = get_padding_info({ src, biases, dst });
+
+    const DataType data_type = src->data_type();
+
+    // Create build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(data_type)));
+    build_opts.add_option("-DNUM_GROUPS=" + support::cpp11::to_string(num_groups));
+    build_opts.add_option_if(biases != nullptr, "-DHAS_BIAS");
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "reshape_to_columns", build_opts.options());
+
+    // Configure window
+    Window win = calculate_max_window(*src, Steps());
+    ICLKernel::configure_internal(win);
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+}
+
+Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups)
+{
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups));
+    return Status{};
+}
+
+void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window);
+
+    auto src    = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS));
+    auto dst    = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    Window out_window;
+    out_window.use_tensor_dimensions(dst->info()->tensor_shape());
+
+    Window in_slice  = window.first_slice_window_3D();
+    Window out_slice = out_window.first_slice_window_2D();
+
+    Window biases_window;
+    Window biases_slice;
+
+    unsigned int idx = num_arguments_per_3D_tensor() + num_arguments_per_2D_tensor();
+    idx += (biases != nullptr) ? num_arguments_per_1D_tensor() : 0;
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(0));
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(1));
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(2));
+    _kernel.setArg<cl_uint>(idx++, src->info()->dimension(3));
+    _kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z());
+
+    if(biases != nullptr)
+    {
+        biases_window.use_tensor_dimensions(biases->info()->tensor_shape());
+        biases_slice = biases_window.first_slice_window_1D();
+    }
+
+    do
+    {
+        // Set arguments
+        unsigned idx = 0;
+        add_3D_tensor_argument(idx, src, in_slice);
+        add_2D_tensor_argument(idx, dst, out_slice);
+        if(biases != nullptr)
+        {
+            add_1D_tensor_argument(idx, biases, biases_slice);
+            ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice));
+        }
+
+        // Run kernel
+        enqueue(queue, *this, in_slice, lws_hint());
+    }
+    while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice));
+}
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
new file mode 100644
index 0000000000..7364eb97ae
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h
@@ -0,0 +1,93 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H
+#define ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H
+
+#include "src/core/common/Macros.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+/** OpenCL kernel to perform reshaping on the weights used by convolution and locally connected layers
+ *
+ * Rearranges each 3-dimensional kernel to a single row, leading to a matrix with linearized kernels.
+ * In combination with @ref opencl::kernels::ClIm2ColKernel, it can transform a convolution into a matrix multiplication.
+ *
+ * For example, assuming a 3D weight kernel of 3x3 dimensions and a depth of 2, we have:
+ * @f[
+ * \left( \begin{array}{ccc}
+ * a000 & a001 & a002 \\
+ * a010 & a011 & a012 \\
+ * a020 & a021 & a022 \\
+ * \end{array} \right)
+ * \left( \begin{array}{ccc}
+ * a100 & a101 & a102 \\
+ * a110 & a111 & a112 \\
+ * a120 & a121 & a122 \\
+ * \end{array} \right)
+ * \rightarrow
+ * \left( \begin{array}{cccccccccccccccccc}
+ * a000 & a001 & a002 & a010 & a011 & a012 & a020 & a021 & a022 & a100 & a101 & a102 & a110 & a111 & a112 & a120 & a121 & a122 \\
+ * \end{array} \right)
+ * @f]
+ */
+class ClWeightsReshapeKernel : public IClKernel
+{
+public:
+    ClWeightsReshapeKernel();
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWeightsReshapeKernel);
+    /** Set the input and output of the kernel.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             The input tensor info to convert. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] if shared,
+     *                             and 5D tensor with dimensions [kernel_x, kernel_y, IFM, OFM, num_patches] if unshared. Data types supported: All
+     * @param[in]  biases          The shared biases tensor info to append. Bias is 1D tensor with dimensions [OFM] if shared and 2D tensor with
+     *                             dimensions [OFM, num_patches] if unshared. Data types supported: F16/F32, for quantized types this must be nullptr.
+     * @warning Appending biases to weights reshaped matrix is not supported for quantized asymmetric types.
+     * @param[out] dst             The output tensor info. Should be a 2D Tensor if there are no groups and the weights are not shared; a 3D Tensor otherwise.
+     *                             Data types supported: Same as @p src
+     * @param[in]  num_groups      (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for the
+     *                             NCHW data layout, and the number of weights (the OFM dimension) must be a multiple of it.
+     */
+    void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClWeightsReshapeKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
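To make the documented rearrangement concrete, a hedged, plain-C++ sketch of the non-grouped reshape: each OFM's kernel_x * kernel_y * IFM weights are linearized into one row, with the bias appended when present (the HAS_BIAS path). The [x, y, c, o] linear layout and the function name below are assumptions for illustration only:

#include <vector>

// One row per output feature map, so that convolution becomes a matrix
// multiplication after im2col (illustrative indexing, x fastest).
std::vector<std::vector<float>> reshape_weights(const std::vector<float> &w,
                                                int kx, int ky, int ifm, int ofm,
                                                const std::vector<float> *biases = nullptr)
{
    std::vector<std::vector<float>> rows(ofm);
    for(int o = 0; o < ofm; ++o)
    {
        rows[o].reserve(kx * ky * ifm + (biases != nullptr ? 1 : 0));
        for(int c = 0; c < ifm; ++c)
        {
            for(int y = 0; y < ky; ++y)
            {
                for(int x = 0; x < kx; ++x)
                {
                    rows[o].push_back(w[((o * ifm + c) * ky + y) * kx + x]);
                }
            }
        }
        if(biases != nullptr)
        {
            rows[o].push_back((*biases)[o]); // HAS_BIAS: one bias appended per row
        }
    }
    return rows;
}

For the documented 3x3 kernel of depth 2, this yields one 18-element row per OFM, matching the matrix example above.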
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp new file mode 100644 index 0000000000..6e7b7f6e14 --- /dev/null +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp @@ -0,0 +1,159 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/tensor_info.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); + return Status{}; +} + +ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); + + auto padding_info = get_padding_info({ src1, src2, dst }); + + const unsigned int min_dimension = 
std::min(src1->dimension(0), src2->dimension(0));
+    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension);
+    const unsigned int vec_size_leftover                 = dst->dimension(0) % num_elems_processed_per_iteration;
+
+    // Add build options
+    CLBuildOptions build_opts;
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type()));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+    build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2)));
+    build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0)));
+    build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0)));
+    build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size()));
+    build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration));
+
+    // If the inputs have different quantization info, set the quantization parameters needed for the re-quantization process
+    const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2);
+    if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo)
+    {
+        const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform();
+        const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform();
+        const UniformQuantizationInfo oq_info  = dst->quantization_info().uniform();
+
+        build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset));
+        build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale));
+        build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset));
+        build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale));
+        build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset));
+        build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale));
+    }
+
+    // Create kernel
+    _kernel = create_kernel(compile_context, "concatenate_width_x2", build_opts.options());
+
+    // Configure kernel window
+    Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration));
+    ICLKernel::configure_internal(win.collapse(win, Window::DimZ));
+
+    ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
+
+    // Set config_id for enabling LWS tuning
+    _config_id = "concatenate_width_x2_";
+    _config_id += lower_string(string_from_data_type(src1->data_type()));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src1->dimension(1));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(0));
+    _config_id += "_";
+    _config_id += support::cpp11::to_string(src2->dimension(1));
+}
+
+void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    Window slice = window.first_slice_window_4D();
+
+    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC));
+    const auto src1 = 
utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, src0, slice); + add_4D_tensor_argument(idx, src1, slice); + add_4D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, window, lws_hint()); + } + while(window.slide_window_slice_4D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h new file mode 100644 index 0000000000..8065fb9f75 --- /dev/null +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h @@ -0,0 +1,67 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H +#define ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the width concatenate kernel of 2 tensors. + * The src1 and src2 tensors will be concatenated into the dst tensor. + */ +class ClWidthConcatenate2TensorsKernel : public IClKernel +{ +public: + ClWidthConcatenate2TensorsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate2TensorsKernel); + /** Initialise the kernel's sources and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src1 First source tensor info. Data types supported: All. + * @param[in] src2 Second source tensor info. Data types supported: same as @p src1 + * @param[out] dst Destination tensor info. Data types supported: Same as @p src1. 
+ */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClWidthConcatenate2TensorsKernel::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst);
+
+    // Inherited methods overridden:
+    void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override;
+};
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WIDTHCONCATENATE_2TENSORS_KERNEL_H */
diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
new file mode 100644
index 0000000000..a08490c565
--- /dev/null
+++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp
@@ -0,0 +1,185 @@
+/*
+ * Copyright (c) 2018-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "src/core/utils/helpers/tensor_info.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); + ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src3->dimension(i) != dst->dimension(i)); + ARM_COMPUTE_RETURN_ERROR_ON(src4->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src1->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); + return Status{}; +} + +void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, ITensorInfo *src2, + ITensorInfo *src3, ITensorInfo *src4, + ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); + + auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); + const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); + const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src1->dimension(2))); + build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(src1->dimension(0))); + build_opts.add_option("-DINPUT2_WIDTH=" + support::cpp11::to_string(src2->dimension(0))); + build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(src3->dimension(0))); + build_opts.add_option("-DINPUT4_WIDTH=" + support::cpp11::to_string(src4->dimension(0))); + 
build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + + // If soources have different quantization info set quantization parameters needed for the re-quantization process + const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); + if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + { + const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); + const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); + const UniformQuantizationInfo iq3_info = src3->quantization_info().uniform(); + const UniformQuantizationInfo iq4_info = src4->quantization_info().uniform(); + const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iq1_info.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iq1_info.scale)); + build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(iq2_info.offset)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(iq2_info.scale)); + build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(iq3_info.offset)); + build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(iq3_info.scale)); + build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(iq4_info.offset)); + build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(iq4_info.scale)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oq_info.offset)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oq_info.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_width_x4", build_opts.options()); + + // Configure kernel window + Window win = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); + + // Set config_id for enabling LWS tuning + _config_id = "concatenate_width_x4_"; + _config_id += lower_string(string_from_data_type(src1->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src1->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src2->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src3->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src3->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src4->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src4->dimension(1)); +} + +void 
ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + Window slice = window.first_slice_window_4D(); + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, src0, slice); + add_4D_tensor_argument(idx, src1, slice); + add_4D_tensor_argument(idx, src2, slice); + add_4D_tensor_argument(idx, src3, slice); + add_4D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, window, lws_hint()); + } + while(window.slide_window_slice_4D(slice)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h new file mode 100644 index 0000000000..80afb3b85d --- /dev/null +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h @@ -0,0 +1,70 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ + +#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H +#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the width concatenate kernel of 4 tensors. + * All source tensors will be concatenated into the destination tensor. + */ +class ClWidthConcatenate4TensorsKernel : public IClKernel +{ +public: + ClWidthConcatenate4TensorsKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenate4TensorsKernel); + /** Initialise the kernel's sources and destination + * + * @param[in] compile_context The compile context to be used. 
+ * @param[in] src1 First source tensor info. Data types supported: All. + * @param[in] src2 Second source tensor info. Data types supported: same as @p src1 + * @param[in] src3 Third source tensor info. Data types supported: same as @p src1 + * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 + * @param[out] dst Destination tensor info. Data types supported: same as @p src1. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_4TENSORS_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp new file mode 100644 index 0000000000..88b5a5e334 --- /dev/null +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp @@ -0,0 +1,127 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Utils.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" + +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); + ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); + + for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + { + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); + } + ARM_COMPUTE_RETURN_ERROR_ON(src->num_dimensions() > 4); + + return Status{}; +} +} // namespace + +ClWidthConcatenateKernel::ClWidthConcatenateKernel() +{ + _type = CLKernelType::ELEMENTWISE; +} + +Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, width_offset, dst)); + return Status{}; +} + +void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); + + auto padding_info = get_padding_info({ src, dst }); + + const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); + + // Add build options + CLBuildOptions build_opts; + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); + build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(src->dimension(2))); + + if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + { + const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); + const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); + + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(iqinfo.offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(oqinfo.offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(iqinfo.scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale)); + } + + // Create kernel + _kernel = create_kernel(compile_context, "concatenate_width", build_opts.options()); + // Configure kernel window + Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); + ICLKernel::configure_internal(win.collapse(win, Window::DimZ)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window 
&window, ::cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); + + const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, window); + add_4D_tensor_argument(idx, dst, window); + enqueue(queue, *this, window, lws_hint()); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h new file mode 100644 index 0000000000..71df077ada --- /dev/null +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H +#define ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H + +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the width concatenate kernel. + * The source tensor will be concatenated into the destination tensor. + */ +class ClWidthConcatenateKernel : public IClKernel +{ +public: + ClWidthConcatenateKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWidthConcatenateKernel); + /** Initialise the kernel's source and destination + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: All. + * @param[in] width_offset The offset on the X axis. + * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. 
+ * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClWidthConcatenateKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, unsigned int width_offset, const ITensorInfo *dst); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WIDTH_CONCATENATE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp new file mode 100644 index 0000000000..4ba6ba8a9a --- /dev/null +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp @@ -0,0 +1,156 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + + const Size2D kernel_size = winograd_info.kernel_size; + const Size2D output_tile_size = winograd_info.output_tile_size; + + const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); + ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); + + // Checks performed when output is configured + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(output); + + const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; + const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); + const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 
1 : input->dimension(2); + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); + Window win_collapsed = win.collapse(win, Window::DimZ); + return std::make_pair(Status{}, win_collapsed); +} +} // namespace + +ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() +{ + _type = CLKernelType::WINOGRAD; +} + +void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); + auto padding_info = get_padding_info({ src, dst }); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DSRC_DIM_Z=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_FILTER_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_FILTER_TRANSFORM_VERTICAL"); + const Size2D kernel_size = winograd_info.kernel_size; + const Size2D output_tile_size = winograd_info.output_tile_size; + + // Create kernel + std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second); + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); +} + +Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); + + return Status{}; +} + +void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + // Setup output window + Window window_out; + window_out.use_tensor_dimensions(dst->info()->tensor_shape(), 0); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, window); + add_3D_tensor_argument(idx, dst, window_out); + enqueue(queue, *this, window, lws_hint()); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
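The kernel above follows the library's validate-then-configure idiom and assembles its program name from the tile and kernel sizes plus the data layout (e.g. "winograd_filter_transform_4x4_3x3_nhwc"). A hedged usage sketch; the shapes, the WinogradInfo argument order and the get_compile_context() accessor are assumptions rather than something this patch defines:

// Illustrative only: transform 3x3 NHWC weights [IFM = 64, 3, 3, OFM = 128] for F(4x4, 3x3).
TensorInfo weights(TensorShape(64U, 3U, 3U, 128U), 1, DataType::F32);
weights.set_data_layout(DataLayout::NHWC);
TensorInfo transformed{}; // left empty so configure() can auto-initialise its shape

const WinogradInfo winograd_info(Size2D(4, 4) /* output tile */, Size2D(3, 3) /* kernel */,
                                 Size2D(224, 224) /* input dims */, PadStrideInfo(1, 1, 1, 1), DataLayout::NHWC);

ClWinogradFilterTransformKernel transform;
if(bool(ClWinogradFilterTransformKernel::validate(&weights, &transformed, winograd_info))) // Status converts to true on success
{
    transform.configure(CLKernelLibrary::get().get_compile_context(), &weights, &transformed, winograd_info);
}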
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h new file mode 100644 index 0000000000..fe0c3da174 --- /dev/null +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h @@ -0,0 +1,77 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H +#define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the Winograd filter transform kernel. */ +class ClWinogradFilterTransformKernel : public IClKernel +{ +public: + ClWinogradFilterTransformKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradFilterTransformKernel); + /** Set the input and output tensor. + * + * @note Winograd filter transform supports the following configurations for NCHW data layout + * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), + * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * @note Winograd filter transform supports the following configurations for NHWC data layout + * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * Strides: only unit strides + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. The input is a 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM] (NCHW data layout) or [IFM, kernel_x, kernel_y, OFM] (NHWC data layout). Data types supported: F16/F32. + * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape.
Data types supported: Same as @p src + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClWinogradFilterTransformKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp new file mode 100644 index 0000000000..58874216bb --- /dev/null +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -0,0 +1,278 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + + const PadStrideInfo conv_info = winograd_info.convolution_info; + const Size2D output_tile_size = winograd_info.output_tile_size; + const Size2D kernel_size = winograd_info.kernel_size; + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + + ARM_COMPUTE_UNUSED(conv_info); + ARM_COMPUTE_UNUSED(output_tile_size); + ARM_COMPUTE_UNUSED(kernel_size); + + // Validate configured output + if(output->total_size() != 0) + { + const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_UNUSED(output); + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + + bool window_changed = false; + Window win = calculate_max_window(*input, Steps(1, 1)); + + if(input->data_layout() == DataLayout::NCHW) + { + const PadStrideInfo conv_info = winograd_info.convolution_info; + const Size2D output_tile_size = winograd_info.output_tile_size; + const Size2D kernel_size = winograd_info.kernel_size; + + unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; + unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; + + AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); + window_changed = update_window_and_padding(win, input_access); + } + + Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +ClWinogradInputTransformKernel::ClWinogradInputTransformKernel() +{ + _type = CLKernelType::WINOGRAD; +} + +BorderSize ClWinogradInputTransformKernel::border_size() const +{ + return _border_size; +} + +void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); + + auto padding_info = get_padding_info({ src, dst }); + + const PadStrideInfo conv_info = winograd_info.convolution_info; + const Size2D output_tile_size = winograd_info.output_tile_size; + const Size2D kernel_size = winograd_info.kernel_size; + + _data_layout = src->data_layout(); + + const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), + kernel_size, + output_tile_size, + conv_info); + + _num_tiles_x = num_tiles.width; + _num_tiles_y = num_tiles.height; + + const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + + // Output auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); + + ARM_COMPUTE_ERROR_ON(_num_tiles_x * _num_tiles_y != static_cast<int>(dst->dimension(1))); + const size_t total_batches = src->tensor_shape().total_size_upper(3); + + CLBuildOptions build_opts; + if(_data_layout == DataLayout::NHWC) + { + build_opts.add_option("-DNHWC"); + build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_w))); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_h))); + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); + build_opts.add_option("-DNUM_TILES_Y=" + support::cpp11::to_string(_num_tiles_y)); + build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); + build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); + build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); + } + else + { + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(_num_tiles_x)); + build_opts.add_option("-DPAD_LEFT=" + support::cpp11::to_string(conv_info.pad_left())); + build_opts.add_option("-DPAD_TOP=" + support::cpp11::to_string(conv_info.pad_top())); + build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option("-DDATA_TYPE=" + 
get_cl_type_from_data_type(src->data_type())); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_INPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_INPUT_TRANSFORM_VERTICAL"); + build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); + } + + // Create kernel + std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + + // Get the maximum dimension from the tile size + const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); + + // Check optimized kernel if output_dims == 2x2 + if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) + { + _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2; + } + + // Append stepz and data layout + kernel_name += "_stepz"; + kernel_name += support::cpp11::to_string(_step_z); + kernel_name += "_" + lower_string(string_from_data_layout(_data_layout)); + + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Create window and update padding + auto win_config = validate_and_configure_window(src, dst, winograd_info); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second, cl::NDRange(1, 1, 8)); + + _border_size = BorderSize(src->padding()); + + ARM_COMPUTE_ERROR_ON((src->data_layout() == DataLayout::NHWC) && has_padding_changed(padding_info)); + + _config_id = kernel_name; + _config_id += support::cpp11::to_string(src->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(2)); + _config_id += "_"; + _config_id += support::cpp11::to_string(conv_info.pad_left()); + _config_id += "_"; + _config_id += support::cpp11::to_string(conv_info.pad_top()); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(_data_layout)); +} + +Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); + return Status{}; +} + +void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + const size_t idx_w = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const size_t total_batches = window.shape().total_size_upper(3); + + // Collapse window + Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); + + if(_data_layout == DataLayout::NHWC) + { + Window slice = window_collapsed.first_slice_window_3D(); + slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); + slice.set(2, 
Window::Dimension(0, total_batches, 1)); + + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, slice); + add_4D_tensor_argument(idx, dst, slice); + enqueue(queue, *this, slice, lws_hint()); + } + else + { + Window slice = window_collapsed.first_slice_window_3D(); + slice.set(idx_w, Window::Dimension(0, _num_tiles_x, 1)); + slice.set(idx_h, Window::Dimension(0, _num_tiles_y, 1)); + + ARM_COMPUTE_ERROR_ON(((slice[idx_c].end() - slice[idx_c].start()) % _step_z) != 0); + slice.set(idx_c, Window::Dimension(slice[idx_c].start(), slice[idx_c].end(), _step_z)); + + unsigned int idx = 2 * num_arguments_per_3D_tensor(); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3])); + _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[3])); + + do + { + unsigned int idx = 0; + add_3D_tensor_argument(idx, src, slice); + add_3D_tensor_argument(idx, dst, slice); + + enqueue(queue, *this, slice, lws_hint()); + } + while(window_collapsed.slide_window_slice_3D(slice)); + } +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
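configure() above sizes everything around the tile count returned by compute_winograd_convolution_tiles(), and asserts that _num_tiles_x * _num_tiles_y matches dst->dimension(1). A back-of-envelope equivalent for unit strides (the formula is an assumption, kept here only to make that check concrete):

// ceil(conv_output / tile): with stride 1, tiles advance by one output tile per step.
static unsigned int num_winograd_tiles(unsigned int in_dim, unsigned int kernel, unsigned int tile,
                                       unsigned int pad_lo, unsigned int pad_hi)
{
    const unsigned int conv_out = in_dim + pad_lo + pad_hi - kernel + 1; // stride-1 convolution output size
    return (conv_out + tile - 1) / tile;                                 // ceil division
}
// e.g. a 224x224 input, 3x3 kernel, F(4x4, 3x3), pad 1 on each side:
// conv_out = 224, tiles per dimension = ceil(224 / 4) = 56, so dst->dimension(1) == 56 * 56.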
\ No newline at end of file diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h new file mode 100644 index 0000000000..631f427b82 --- /dev/null +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h @@ -0,0 +1,87 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H +#define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** OpenCL kernel to perform Winograd input transform.*/ +class ClWinogradInputTransformKernel : public IClKernel +{ +public: + ClWinogradInputTransformKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradInputTransformKernel); + /** Set the input and output of the kernel. + * + * @note Winograd input transform supports the following configurations for NCHW data layout + * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), + * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * @note Winograd input transform supports the following configurations for NHWC data layout + * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * Strides: only unit strides + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The input tensor info to transform. Data types supported: F16/F32 + * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p src + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. 
+ */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClWinogradInputTransformKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + BorderSize border_size() const override; + +private: + using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>; + + BorderSize _border_size{ 0 }; + DataLayout _data_layout{ DataLayout::UNKNOWN }; + int _num_tiles_x{ 0 }; + int _num_tiles_y{ 0 }; + unsigned int _step_z{ 1 }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp new file mode 100644 index 0000000000..a8cf8234ad --- /dev/null +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -0,0 +1,268 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/IAccessWindow.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/AccessWindowStatic.h" +#include "src/core/CL/CLValidate.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/WindowHelpers.h" +#include "support/Cast.h" +#include "support/StringSupport.h" + +#include <cmath> + +using namespace arm_compute::misc::shape_calculator; + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace +{ +Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_UNUSED(act_info); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); + + ARM_COMPUTE_RETURN_ERROR_ON(output->data_layout() != winograd_info.output_data_layout); + + const PadStrideInfo conv_info = winograd_info.convolution_info; + const Size2D output_tile_size = winograd_info.output_tile_size; + const Size2D kernel_size = winograd_info.kernel_size; + const Size2D input_dimensions = winograd_info.input_dimensions; + const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); + + // Compute number of elements to process in the X and Y direction + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, + kernel_size, + output_tile_size, + conv_info); + + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area()))); + + if(bias != nullptr) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); + } + + // Checks performed when output is configured + if(output->total_size() != 0) + { + const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); + + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); + } + + return Status{}; +} + +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); + ARM_COMPUTE_UNUSED(bias); + + constexpr unsigned int num_elems_processed_per_iteration = 1; + + Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); + bool window_changed = false; + + if(output->data_layout() == 
DataLayout::NCHW) + { + const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); + const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); + + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); + window_changed = update_window_and_padding(win, input_access, output_access); + } + + Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + return std::make_pair(err, win); +} +} // namespace + +ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() +{ + _type = CLKernelType::WINOGRAD; +} + +void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); + + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); + + // Configure kernel window + auto win_config = validate_and_configure_window(src, bias, dst, winograd_info.output_tile_size); + ARM_COMPUTE_ERROR_THROW_ON(win_config.first); + IClKernel::configure_internal(win_config.second); + + auto padding_info = get_padding_info({ src, bias, dst }); + + _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; + + // Compute num_tiles_x + const Size2D input_dimensions = winograd_info.input_dimensions; + const Size2D kernel_size = winograd_info.kernel_size; + const Size2D output_tile_size = winograd_info.output_tile_size; + const PadStrideInfo conv_info = winograd_info.convolution_info; + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); + + // Compute the number of output tiles along the x and y direction of size "output_tile_size" + const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, + kernel_size, + output_tile_size, + conv_info); + const size_t total_batches = dst->tensor_shape().total_size_upper(3); + + // Set build options + CLBuildOptions build_opts; + build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + + if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) + { + build_opts.add_option("-DVEC_SIZE=2"); + } + else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) + { + build_opts.add_option("-DVEC_SIZE=4"); + } + + build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); + build_opts.add_option("-cl-fast-relaxed-math"); + build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); + build_opts.add_option("-DNUM_TILES_X=" + support::cpp11::to_string(num_tiles.width)); + 
build_opts.add_option("-DOUTPUT_TILE_W=" + support::cpp11::to_string(output_tile_size.width)); + build_opts.add_option("-DOUTPUT_TILE_H=" + support::cpp11::to_string(output_tile_size.height)); + build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); + build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(1))); + build_opts.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(idx_width))); + build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); + build_opts.add_option_if(total_batches > 1, "-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(2))); + build_opts.add_option_if(winograd_info.kernel_size.height == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_HORIZONTAL"); + build_opts.add_option_if(winograd_info.kernel_size.width == 1, "-DWINOGRAD_OUTPUT_TRANSFORM_VERTICAL"); + + // Create kernel + std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + + // Set config_id for enabling LWS tuning + _config_id = kernel_name; + _config_id += "_"; + _config_id += lower_string(string_from_data_type(src->data_type())); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(src->dimension(1)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(0)); + _config_id += "_"; + _config_id += support::cpp11::to_string(dst->dimension(1)); + _config_id += "_"; + _config_id += lower_string(string_from_data_layout(winograd_info.output_data_layout)); + + ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); +} + +Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? 
bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); + return Status{}; +} + +void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +{ + ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); + ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + // Collapse window + Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); + + // Get initial windows + Window slice = window_collapsed.first_slice_window_4D(); + slice.set(Window::DimZ, Window::Dimension(0, 1, 1)); + + // Setup output slice + Window slice_out(slice); + slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); + slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); + + if(bias != nullptr) + { + unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); + Window slice_biases; + slice_biases.use_tensor_dimensions(bias->info()->tensor_shape()); + add_1D_tensor_argument(idx1, bias, slice_biases); + } + + if(_is_nhwc) + { + unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); + _kernel.setArg(idx2, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); + } + + do + { + unsigned int idx = 0; + add_4D_tensor_argument(idx, src, slice); + add_4D_tensor_argument(idx, dst, slice_out); + enqueue(queue, *this, slice, lws_hint()); + } + while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); +} +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h new file mode 100644 index 0000000000..674d52c904 --- /dev/null +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h @@ -0,0 +1,85 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H +#define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H + +#include "arm_compute/core/KernelDescriptors.h" +#include "src/core/common/Macros.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +/** Interface for the Winograd output transform kernel. */ +class ClWinogradOutputTransformKernel : public IClKernel +{ +public: + ClWinogradOutputTransformKernel(); + ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(ClWinogradOutputTransformKernel); + /** Set the input and output tensor. + * + * @note Winograd output transform supports the following configurations for NCHW data layout + * F(output tile, kernel size):F(2x2, 3x3), F(2x1, 3x1), F(1x2, 1x3), + * F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * @note Winograd output transform supports the following configurations for NHWC data layout + * F(output tile, kernel size):F(4x4, 3x3), F(4x1, 3x1), F(1x4, 1x3), + * F(4x4, 5x5), F(4x1, 5x1), F(1x4, 1x5) + * + * Strides: only unit strides + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info with shape [C, N, K, batches]. Data types supported: F16/F32. + * @param[in] bias Biases tensor info. Shared biases supported. Biases are a 1D tensor with dimensions [OFM]. It can be a nullptr. Data type supported: as @p src + * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_output_transform_shape. Data types supported: Same as @p src + * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClWinogradOutputTransformKernel::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + + // Inherited methods overridden: + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + +private: + using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>; + + bool _is_nhwc{ false }; +}; +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H */ diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp new file mode 100644 index 0000000000..1bf27ba277 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -0,0 +1,116 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/OpenCL.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, + bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +{ + ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); + v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1)); + h0 = std::max(std::min(static_cast<int>(n / n0), static_cast<int>(h0)), static_cast<int>(1)); + + const GEMMLHSMatrixInfo lhs_info(m0, k0, v0, lhs_transpose, lhs_interleave); + const GEMMRHSMatrixInfo rhs_info(n0, k0, h0, rhs_transpose, rhs_interleave, export_to_cl_image); + + return std::make_pair(lhs_info, rhs_info); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); + const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); + const TensorInfo tensor_reshaped_info(shape, 1, data_type); + + if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + { + return info_img; + } + else + { + return info_buf; + } +} + +void update_padding_for_cl_image(ITensorInfo *tensor) +{ + constexpr unsigned int num_floats_per_pixel = 4; + + const unsigned int stride_y_in_elements = tensor->strides_in_bytes()[1] / tensor->element_size(); + const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); + + ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); + if(pixel_alignment == 0) + { + return; + } + + const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; + const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / 
row_pitch_alignment) * row_pitch_alignment;
+    const unsigned int padding        = round_up_width - stride_y_in_elements;
+
+    tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0));
+}
+
+Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info)
+{
+    if(rhs_info.export_to_cl_image)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.n0 == 2) || (rhs_info.n0 == 3), "Export to cl_image only supported with n0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 == 2) || (rhs_info.k0 == 3), "Export to cl_image only supported with k0 = 4, 8 or 16");
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16);
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Cannot retrieve the cl_image pitch alignment");
+
+        // Check the width and height of the output tensor.
+        // Since we cannot create a 3d image from a buffer, the third dimension is collapsed onto the second dimension
+        const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>();
+        const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>();
+
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Unsupported width for cl_image");
+        ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Unsupported height for cl_image");
+    }
+
+    return Status{};
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
new file mode 100644
index 0000000000..3fce8c9173
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h
@@ -0,0 +1,95 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_HELPERS_H
+#define ARM_COMPUTE_CL_GEMM_HELPERS_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Configure @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * @param[in] m                  Number of rows (M) in the LHS matrix not reshaped
+ * @param[in] n                  Number of columns (N) in the RHS matrix not reshaped
+ * @param[in] m0                 Number of rows processed by each thread/work-item
+ * @param[in] n0                 Number of columns processed by each thread/work-item
+ * @param[in] k0                 Number of inner accumulations performed by each thread/work-item
+ * @param[in] v0                 Number of vertical blocks of size (m0xk0) stored on the same output row
+ * @param[in] h0                 Number of horizontal blocks of size (k0xn0) stored on the same output row
+ * @param[in] lhs_interleave     True if the v0 (m0xk0) blocks have to be interleaved in the output row
+ * @param[in] rhs_interleave     True if the h0 (k0xn0) blocks have to be interleaved in the output row
+ * @param[in] lhs_transpose      True if the (m0xk0) block has to be transposed before being stored
+ * @param[in] rhs_transpose      True if the (k0xn0) block has to be transposed before being stored
+ * @param[in] export_to_cl_image (Optional) True if the RHS reshaped matrix has to be exported to cl_image
+ *
+ * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ */
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0,
+                                                                       bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false);
+
+/** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+ *
+ * This function accepts two GEMMLHSMatrixInfo/GEMMRHSMatrixInfo pairs, of which only the first has cl_image2d support,
+ * and selects between them by validating the GEMMRHSMatrixInfo. If the validation passes, the function returns
+ * the first pair, with cl_image2d support; otherwise it returns the buffer-based fall-back pair.
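+ *
+ * A minimal usage sketch, assuming m, n, k and b describe the GEMM shape; the block sizes below are illustrative rather than tuned:
+ * @code
+ * // Candidate pair that exports the reshaped RHS matrix to a cl_image2d object
+ * const auto info_img = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true);
+ * // Same block configuration kept on the OpenCL buffer, used as the fall-back
+ * const auto info_buf = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false);
+ * const auto info     = select_lhs_rhs_info(info_img, info_buf, n, k, b, DataType::F32);
+ * @endcode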
+ * + * @param[in] info_img GEMMLHSMatrixInfo/GEMMRHSMatrixInfo with cl_image2d support + * @param[in] info_buf GEMMLHSMatrixInfo/GEMMRHSMatrixInfo to fall-back if cl_image2d cannot be used + * @param[in] n Number of columns (N) in the RHS matrix not reshaped + * @param[in] k Number of rows (K) in the RHS matrix not reshaped + * @param[in] b Batch size + * @param[in] data_type Data type + * + * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo + */ +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, unsigned int k, unsigned int b, DataType data_type); + +/** Update padding required to export the OpenCL buffer to OpenCL image2d + * + * @param[in,out] tensor ITensorInfo of the tensor required to be exported to OpenCL image2d + */ +void update_padding_for_cl_image(ITensorInfo *tensor); + +/** Utility function to validate the image2d OpenCL object support on the RHS reshaped matrix + * + * @param[in] tensor_reshaped_info TensorInfo for the RHS reshaped matrix + * @param[in] rhs_info @ref GEMMRHSMatrixInfo + * + * @return Status reporting if we can use the image2d OpenCL object on the RHS reshaped matrix + */ +Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info); +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_HELPERS_H */ diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h new file mode 100644 index 0000000000..a49836cfda --- /dev/null +++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -0,0 +1,123 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
+#define ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H
+
+#include "arm_compute/core/GPUTarget.h"
+#include "arm_compute/core/Types.h"
+#include "src/core/common/Macros.h"
+
+#include <array>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** Basic container for the OpenCL GEMM configuration functions */
+template <class T>
+class CLGEMMConfigArray
+{
+public:
+    /** Alias for F32 index */
+    static constexpr size_t DT_F32 = 0;
+    /** Alias for F16 index */
+    static constexpr size_t DT_F16 = 1;
+    /** Alias for Int8 index */
+    static constexpr size_t DT_INT8 = 2;
+
+    /** Constructor
+     *
+     * @param[in] func_f32  Function to call for GEMM F32
+     * @param[in] func_f16  Function to call for GEMM F16
+     * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL)
+     */
+    CLGEMMConfigArray(T func_f32, T func_f16, T func_int8)
+        : _configs{ func_f32, func_f16, func_int8 }
+    {
+    }
+
+    /** Method to return the GEMM configuration function based on data type
+     *
+     * @param[in] data_type Input data type
+     *
+     * @return The configuration function for the given data type, or nullptr if the data type is not supported
+     */
+    T get_function(DataType data_type)
+    {
+        switch(data_type)
+        {
+            case DataType::F32:
+                return _configs.at(DT_F32);
+            case DataType::F16:
+                return _configs.at(DT_F16);
+            case DataType::QASYMM8:
+            case DataType::QASYMM8_SIGNED:
+            case DataType::QSYMM8_PER_CHANNEL:
+                return _configs.at(DT_INT8);
+            default:
+                return nullptr;
+        }
+    }
+
+private:
+    std::array<T, 3> _configs;
+};
+
+/** Basic interface for the GEMM kernel configuration */
+class IClGemmKernelConfig
+{
+public:
+    /** Constructor
+     *
+     * @param[in] arch GPU target
+     */
+    IClGemmKernelConfig(GPUTarget arch)
+        : _target(arch)
+    {
+    }
+    ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig);
+    /** Virtual destructor */
+    virtual ~IClGemmKernelConfig() = default;
+    /** Given M, N, K and B, this method returns the @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo to be used
+     *
+     * @param[in] m         Number of rows of the LHS matrix
+     * @param[in] n         Number of columns of the RHS matrix
+     * @param[in] k         Number of columns of the LHS matrix, which equals the number of rows of the RHS matrix
+     * @param[in] b         Batch size
+     * @param[in] data_type Data type
+     *
+     * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo
+     */
+    virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0;
+
+protected:
+    GPUTarget _target;
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_ICL_GEMM_KERNEL_CONFIG_H */
diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
new file mode 100644
index 0000000000..b9eac2412e
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp
@@ -0,0 +1,246 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G76: + func = configs_G76.get_function(data_type); + break; + case GPUTarget::G71: + func = configs_G71.get_function(data_type); + break; + default: + func = configs_G7x.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, 
false, false, false); + } + else if(n >= 2048 && n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + if(m < 64) + { + return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); + } + } + } + else + { + if(m == 1) + { + if(n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n > 4196) + { + return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); + } + else + { + if(k < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); + } + else if(k >= 2048 && k < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 1, false, false, false, false); + } + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 8, 2, 1, 1, false, false, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 16384) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + if(m < 64) + { + return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int 
m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h new file mode 100644 index 0000000000..9af5dc4135 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -0,0 +1,62 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Bifrost based OpenCL GEMMNative configuration */ +class ClGemmDefaultConfigNativeBifrost final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_BIFROST_H */ diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp new file mode 100644 index 0000000000..b9f36c7210 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -0,0 +1,73 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+#include "arm_compute/core/GPUTarget.h"
+#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h"
+
+#include <utility>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu)
+    : IClGemmKernelConfig(gpu)
+{
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type)
+{
+    using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k,
+                                                                                                                                   unsigned int b);
+
+    CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr,
+                                                                        nullptr,
+                                                                        &ClGemmDefaultConfigNativeMidgard::default_q8);
+
+    auto func = configs_default.get_function(data_type);
+    ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not supported for GEMM");
+    return (this->*func)(m, n, k, b);
+}
+
+std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b)
+{
+    ARM_COMPUTE_UNUSED(k);
+    ARM_COMPUTE_UNUSED(b);
+
+    const unsigned int m0 = std::min(m, static_cast<unsigned int>(4));
+    const unsigned int n0 = std::min(n, static_cast<unsigned int>(4));
+
+    return configure_lhs_rhs_info(m, n, m0, n0, 2, 1, 1, false, false, false, false);
+}
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
\ No newline at end of file diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h new file mode 100644 index 0000000000..c055753c48 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Midgard based OpenCL GEMMNative configuration */ +class ClGemmDefaultConfigNativeMidgard final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_MIDGARD_H */ diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp new file mode 100644 index 0000000000..95a4d2bd69 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -0,0 +1,168 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, + &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + + auto func = configs_default.get_function(data_type); + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 2, 1, 1, false, false, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 2, 1, 1, false, false, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(m == 1) + { + if(n < 2048) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); + } + else if(n >= 2048 && n < 
16384) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + if(m < 64) + { + return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 16, 1, 1, false, false, false, false); + } + } + } + else + { + if(m == 1) + { + if(n < 8192) + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 16, 1, 1, false, false, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 8, 16, 1, 1, false, false, false, false); + } + } +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h new file mode 100644 index 0000000000..f0f812fd46 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h @@ -0,0 +1,59 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Valhall based OpenCL GEMMNative configuration */ +class ClGemmDefaultConfigNativeValhall final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigNativeValhall(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_NATIVE_VALHALL_H */ diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h new file mode 100644 index 0000000000..cf8412830b --- /dev/null +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -0,0 +1,71 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
+#define ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h"
+#include "src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMNative factory class */
+class ClGemmNativeKernelConfigurationFactory final
+{
+public:
+    /** Static method to construct a CLGEMMNative kernel configuration object according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMNative kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+                return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu);
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigNativeBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClGemmDefaultConfigNativeValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_NATIVE_KERNEL_CONFIGURATION_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
new file mode 100644
index 0000000000..657018eb53
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp
@@ -0,0 +1,356 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +using namespace arm_compute::misc::shape_calculator; + +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G76: + func = configs_G76.get_function(data_type); + break; + case GPUTarget::G52: + func = configs_G52.get_function(data_type); + break; + default: + func = configs_G7x.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + 
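// Descriptive note: this G7x heuristic branches on n only. Narrow RHS matrices (n <= 4) are
+    // given 4x2 blocks with k0 = 8 (LHS interleaved, RHS transposed), while wider cases use
+    // 5x4 blocks with k0 = 4 (RHS interleaved and transposed); the values are the tuned defaults below.
+    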
ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, false, true, false, true); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, true, false, false, true); + } + } + else + { + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 6, 4, 4, 2, 2, true, true, false, true); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(workload <= 274.4000f) + { + if(r_nk <= 0.7461f) + { + if(r_mn <= 21.1667f) + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_mk <= 17.3926f) + { + if(workload <= 542.4000f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, 
true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_nk <= 0.5463f) + { + if(workload <= 11767.6001f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(workload <= 323.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 2, 2, true, true, true, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + // Get lhs_info/rhs_info in case of OpenCL buffer + if(n <= 4) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 2, 8, 16, false, false, false, true); + } + + // Get lhs_info/rhs_info in case of OpenCL image + // Condition on the GPU workload + if((m / 4) * (n / 4) >= 2560) + { + // Big workload + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + } + else + { + // Small workload + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + } + + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); + const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); + const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); + + // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d + const bool use_cl_image2d = (n 
<= 4) ? false : true; + + if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + { + return std::make_pair(lhs_info_img, rhs_info_img); + } + else + { + return std::make_pair(lhs_info_buf, rhs_info_buf); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + + if(workload <= 1595.2000f) + { + if(r_mk <= 2.1044f) + { + if(workload <= 870.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 2, 4, 2, 2, false, false, true, false, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 4, 2, true, true, true, false, false); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, false, true, false, true); + } +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h new file mode 100644 index 0000000000..d86d1ba0a7 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Bifrost based OpenCL GEMMReshaped configuration */ +class ClGemmDefaultConfigReshapedBifrost final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_BIFROST_H */ diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp new file mode 100644 index 0000000000..58d0873b86 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp @@ -0,0 +1,538 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G78: + func = configs_G78.get_function(data_type); + break; + case GPUTarget::G77: + default: + func = configs_G77.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 4, 4, 2, 16, 0, 1, 0, 1); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); + + if(r_mk <= 0.11824845522642136) + { + if(workload <= 880.0) + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); + } + else + { + if(r_nk <= 0.42521367967128754) + { + if(workload <= 1726.4000244140625) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + if(workload <= 1241.6000366210938) + { + return 
configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); + } + } + } + } + else + { + if(workload <= 11404.7998046875) + { + if(r_mk <= 1.0126488208770752) + { + if(r_mn <= 2.545312523841858) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); + } + } + else + { + if(workload <= 2881.199951171875) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + else + { + if(r_nk <= 0.5765306055545807) + { + if(r_mn <= 6.010416746139526) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(workload <= 1288.0000f) + { + if(workload <= 505.6000f) + { + if(r_mn <= 0.4466f) + { + if(r_nk <= 0.2384f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 2, 2, 0, 0, 1, 0, 0); + } + } + else + { + if(r_mn <= 0.2250f) + { + if(r_mn <= 0.1599f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + else + { + if(r_mk <= 0.7609f) + { + if(r_mn <= 2.5453f) + { + if(workload <= 1089.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 2, 4, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 16, 4, 4, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 
8, 4, 4, 0, 0, 1, 0, 1); + } + } + } + } + else + { + if(workload <= 5434.4001f) + { + if(workload <= 1603.2000f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(r_nk <= 0.6192f) + { + if(r_mn <= 16.1016f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(workload <= 2750.0000f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(r_mk <= 6.3151f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + } + } + else + { + if(r_mk <= 0.0387f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + else + { + if(r_mk <= 2.5859f) + { + if(r_mk <= 0.2734f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + } + } + } + else + { + if(r_mk <= 25.7500f) + { + if(r_mk <= 0.3615f) + { + if(r_mn <= 0.0913f) + { + if(r_mk <= 0.0683f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); + } + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + else + { + if(workload <= 11174.3999f) + { + if(r_mk <= 0.8047f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(workload <= 7185.5999f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); + } + } + } + else + { + if(workload <= 17917.5000f) + { + if(r_mk <= 1.5078f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); + } + } + else + { + if(workload <= 34449.6016f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); + } + } + } + } + } + else + { + if(r_mk <= 331.1111f) + { + if(workload <= 53397.5996f) + { + if(r_mn <= 57.8063f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + } + else + { + if(r_nk <= 0.9211f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + } + } + else + { + if(workload <= 38070.4004f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + } + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(workload <= 801.6000f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + else + { + if(r_mn <= 0.1211f) + { + if(workload <= 3296.0000f) + { + return 
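+    // A note on the features driving this tree (read off the code above, not
+    // documented behaviour): workload = (m * n * b) / 20.0f scales with the
+    // number of output elements across the batch, r_mn compares output height
+    // to width, and r_mk / r_nk compare the output sides with the reduction
+    // depth k. The / 20.0f factor looks like a normalisation inherited from
+    // the offline tuning setup rather than anything device-specific. As a
+    // worked example, m = 8, n = 512, b = 4 gives r_mn = 0.015625 and
+    // workload = 16384 / 20 = 819.2f, which selects the configuration
+    // returned just below.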
configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + if(r_nk <= 1.0625f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 4, 0, 0, 1, 0, 1); + } + } + } + else + { + if(workload <= 5068.8000f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + else + { + if(r_nk <= 0.2361f) + { + if(workload <= 12630.0000f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 1, 0, 0, 1, 0, 1); + } + } + else + { + if(workload <= 178790.3984f) + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); + } + } + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(n <= 4) + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 2, 2, 0, 1, 0, 1); + } +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h new file mode 100644 index 0000000000..466eda00a6 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Valhall based OpenCL GEMMReshaped configuration */ +class ClGemmDefaultConfigReshapedValhall final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_VALHALL_H */ diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h new file mode 100644 index 0000000000..1c32f1358b --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -0,0 +1,69 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h"
+#include "src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshaped factory class */
+class ClGemmReshapedKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshaped kernel configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshaped kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigReshapedBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClGemmDefaultConfigReshapedValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_KERNEL_CONFIGURATION_H */
diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
new file mode 100644
index 0000000000..9c23d9c998
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp
@@ -0,0 +1,547 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +using namespace arm_compute::misc::shape_calculator; + +ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + switch(_target) + { + case GPUTarget::G76: + func = configs_G76.get_function(data_type); + break; + case GPUTarget::G51: + func = configs_G51.get_function(data_type); + break; + case GPUTarget::G52: + func = configs_G52.get_function(data_type); + break; + case GPUTarget::G31: + func = configs_G31.get_function(data_type); + break; + default: + func = configs_G7x.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n <= 2548) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 8, false, true, false, true, false); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, 
true, false, true); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); + } + else + { + const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); + if(m >= 28) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + const bool is_workload_big = ((m * n * b) / 16) >= 2048; + + if(m == 1) + { + if(n >= 8192) + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + if(n <= 204) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true, false); + } + } + } + else + { + const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); + if(is_workload_big) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + } + } + + // Get lhs_info/rhs_info in case of OpenCL image + const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); + if(is_workload_big) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + } + + const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); + const TensorShape shape = compute_rhs_reshaped_shape(tensor_rhs_info, rhs_info_img); + const TensorInfo tensor_reshaped_info(shape, 1, DataType::F32); + + // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d + const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
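+    // Reading of the condition split around this ternary: the buffer path is
+    // forced for gemv-like cases (m == 1) and for small, narrow workloads
+    // ((m * n * b) / 16 < 2048 with n < 128), where setting up the image2d
+    // import is unlikely to pay off; everything else prefers the image2d RHS.
+    // The predicate could be written more directly as
+    //   const bool use_cl_image2d = !((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128));
+    // and note that, even when true, the image2d path is still gated by
+    // validate_image2d_support_on_rhs() just below.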
false : true; + + if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + { + return std::make_pair(lhs_info_img, rhs_info_img); + } + else + { + return std::make_pair(lhs_info_buf, rhs_info_buf); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(m == 1) + { + if(r_nk <= 0.4664f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(workload <= 274.4000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int n0 = n < 1280 ? 
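+    // h0 here is the number of horizontal k0 x n0 blocks stored on the same
+    // row of the reshaped RHS (see GEMMRHSMatrixInfo). Deriving it as
+    // max(n / n0, 1U) packs a whole output row per reshaped row for this gemv
+    // case, and the max() guards against h0 == 0 when n < n0. The width
+    // threshold picks a narrower block for small n, for instance:
+    //   n = 1024 -> n0 = 2, h0 = 512
+    //   n = 4096 -> n0 = 4, h0 = 1024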
2 : 4; + const unsigned int h0 = std::max(n / n0, 1U); + return configure_lhs_rhs_info(m, n, 1, n0, 4, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + if(n > 2048) + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, false, true, false, true); + } + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 4, false, true, false, true); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + if(m == 1) + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + + if(r_mk <= 0.0026f) + { + if(r_nk <= 0.4664f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + else + { + if(r_mk <= 0.0148f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + else + { + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + + if(workload <= 362.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); + } + else + { + if(r_mn <= 22.6067f) + { + if(workload <= 708.8000f) + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 16, false, false, false, false, false); + } + } + else + { + if(r_nk <= 0.0917f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); + } + else + { + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + 
std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + + if(m == 1) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); + } + else + { + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(workload <= 7449.60f) + { + if(workload <= 691.60f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); + } + else + { + if(workload <= 4155.20f) + { + return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 8, 2, 1, 32, false, false, false, false, false); + } + } + } + else + { + if(workload <= 16300.80f) + { + if(r_mn <= 44.56f) + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + else + { + return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + } + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F16); + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int n0 = n < 1280 ? 
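+    // Same n0 selection as the F32 variant above, but the call below uses
+    // k0 = 8 instead of 4; with half-size FP16 elements this keeps the bytes
+    // fetched per RHS block roughly constant. That rationale is an inference
+    // from the paired F32/F16 code, not a documented rule.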
2 : 4; + const unsigned int h0 = std::max(n / n0, 1U); + return configure_lhs_rhs_info(m, n, 1, n0, 8, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, true, false, true); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(dot8_supported(CLKernelLibrary::get().get_device())) + { + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 4, 1U); + return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, false, true, false, true); + } + } + else + { + const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1)); + if(m == 1) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, 2, false, true, false, true); + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); + } + else + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 4, 2, 16, 1, h0, false, true, false, true); + } +} + +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h new file mode 100644 index 0000000000..321cbb5250 --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -0,0 +1,68 @@ +/* + * Copyright (c) 2019-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Bifrost based OpenCL GEMMReshapedOnlyRHS configuration */ +class ClGemmDefaultConfigReshapedRhsOnlyBifrost final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_BIFROST_H */ diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp new file mode 100644 index 0000000000..a82084a8df --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -0,0 +1,570 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h" + +#include "arm_compute/core/CL/CLHelpers.h" +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/TensorShape.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + +#include <utility> + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +using namespace arm_compute::misc::shape_calculator; + +ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu) + : IClGemmKernelConfig(gpu) +{ +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +{ + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, + unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + + ConfigurationFunctionExecutorPtr func = nullptr; + + switch(_target) + { + case GPUTarget::G78: + func = configs_G78.get_function(data_type); + break; + case GPUTarget::G77: + default: + func = configs_G77.get_function(data_type); + break; + } + + ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); + return (this->*func)(m, n, k, b); +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + if(m == 1) + { + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + + if(r_mk <= 0.0064484127797186375) + { + if(r_mn <= 
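+    // The unrounded constants in these trees (0.0064484127797186375 just
+    // above, 0.0028273810748942196 just below, and so on) strongly suggest
+    // the branches were exported from decision trees fitted offline on
+    // benchmark data. Treat them as tuned constants rather than values with
+    // standalone meaning; changing one in isolation is unlikely to be safe
+    // without re-running the tuning.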
0.0028273810748942196) + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + + const unsigned int h0 = std::max(n / 4, 1U); + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 8, 0, 1, 0, 0, 0); + } + } + else + { + if(r_mk <= 0.020312500186264515) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, 0, 1, 0, 1, 0); + } + } + } + else + { + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + + if(workload <= 1999.2000122070312) + { + if(workload <= 747.1999816894531) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + if(r_mn <= 0.03348214365541935) + { + if(r_mk <= 0.028125000186264515) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 2, 0, 0, 0, 1, 1); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + else + { + GEMMLHSMatrixInfo lhs_info_buf; + GEMMRHSMatrixInfo rhs_info_buf; + GEMMLHSMatrixInfo lhs_info_img; + GEMMRHSMatrixInfo rhs_info_img; + std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, 0, 1, 0, 0, 1); + std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); + + return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), + std::make_pair(lhs_info_buf, rhs_info_buf), + n, k, b, DataType::F32); + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + if(n <= 836.0) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, 0, 1, 0, 1, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, h0, 0, 1, 0, 1, 0); + } + } + else if(m < 128) + { + const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), 
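+    // This max(min(...)) pair is simply a clamp of n / 4 into [1, 256]; with
+    // C++17 available it could be written as
+    //   const int h0 = std::clamp(static_cast<int>(n / 4), 1, 256);
+    // The lower bound keeps h0 valid for tiny n, while the 256 cap appears to
+    // bound the block count of the reshaped RHS for very wide matrices (an
+    // assumption from usage; the cap is not documented here).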
static_cast<int>(1)); + if(k >= 512) + { + return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0); + } + } + else + { + const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); + if(n >= 64) + { + return configure_lhs_rhs_info(m, n, 4, 8, 4, 1, h0, 0, 1, 0, 0); + } + else + { + if(k >= 512) + { + return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, 0, 1, 0, 0); + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + ARM_COMPUTE_UNUSED(k); + ARM_COMPUTE_UNUSED(b); + + if(m == 1) + { + const unsigned int h0 = std::max(n / 2, 1U); + return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); + } + else + { + const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); + if(m >= 28) + { + return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 16, 1, h0, 0, 1, 0, 1); + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(m == 1) + { + if(workload <= 278.7000f) + { + if(workload <= 7.5000f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); + } + else + { + if(r_mn <= 0.0031f) + { + if(workload <= 256.6000f) + { + if(workload <= 16.7500f) + { + if(r_nk <= 1.6671f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); + } + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); + } + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); + } + } + else + { + if(r_mk <= 0.0027f) + { + if(r_mk <= 0.0014f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); + } + else + { + if(workload <= 8.9500f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); + } + } + } + else + { + if(workload <= 14.1500f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); + } + else + { + if(r_mk <= 0.0041f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); + } + } + } + } + } + } + else + { + if(workload <= 363.7000f) + { + if(r_mk <= 0.0031f) + { + return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 32, 0, 1, 0, 1, 0); + } + } + else + { + return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); + } + } + } + else + { + if(workload <= 1384.8000f) + { + if(workload <= 704.0000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); + } + else + { + return 
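+    // In this file the boolean tail of configure_lhs_rhs_info() is written
+    // with 0/1 literals rather than the false/true spelling used in the
+    // Bifrost variants; the meaning is identical through implicit conversion.
+    // In the call just below, the trailing 1 is export_to_cl_image, i.e. this
+    // branch opts the reshaped RHS into the OpenCL image2d path.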
configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1); + } + } + else + { + if(workload <= 16761.6006f) + { + if(r_mn <= 187.1250f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 4, 0, 0, 0, 1, 1); + } + } + else + { + if(r_mk <= 432.4630f) + { + return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 16, 0, 1, 0, 1, 1); + } + } + } + } +} + +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +{ + const float r_mn = static_cast<float>(m) / static_cast<float>(n); + const float r_mk = static_cast<float>(m) / static_cast<float>(k); + const float r_nk = static_cast<float>(n) / static_cast<float>(k); + const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; + + if(m == 1) + { + if(r_mn <= 0.0038f) + { + if(workload <= 353.9000f) + { + if(workload <= 278.7000f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); + } + else + { + if(r_mk <= 0.0004f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); + } + else + { + if(r_mk <= 0.0030f) + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); + } + } + } + } + else + { + if(r_nk <= 1.9384f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 8, 4, 1, 8, 0, 1, 1, 0, 1); + } + } + } + else + { + if(r_nk <= 1.0368f) + { + return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, 0, 0, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 32, 0, 0, 1, 0, 0); + } + } + } + else + { + if(workload <= 1422.4000f) + { + if(workload <= 704.0000f) + { + return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); + } + else + { + if(workload <= 1197.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); + } + else + { + if(workload <= 1241.6000f) + { + return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); + } + } + } + } + else + { + if(workload <= 2769.6000f) + { + if(workload <= 1846.4000f) + { + if(r_mn <= 2.4927f) + { + return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); + } + } + else + { + if(r_mn <= 0.6261f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); + } + else + { + if(r_mk <= 3.4453f) + { + if(r_mn <= 1.4135f) + { + return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); + } + else + { + return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); + } + } + else + { + return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); + } + } + } + } + else + { + if(r_nk <= 0.0302f) + { + return configure_lhs_rhs_info(m, n, 2, 4, 8, 1, 8, 0, 1, 1, 0, 1); + } + else + { + if(r_mk <= 181.3750f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); + } + else + { + if(workload <= 28035.2002f) + { + return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); + } + else + { + if(r_mk <= 808.6667f) + { + return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 0); + } + 
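+    // Despite its depth, the m > 1 half of this F16 tree only ever returns
+    // four tile shapes: (m0, n0, k0) of (2, 2, 8), (2, 4, 8), (2, 8, 8) and
+    // (4, 4, 8), with h0 varying between 8, 16 and 32. The branching is
+    // purely a partition of the (m, n, k, b) space between those shapes.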
else + { + return configure_lhs_rhs_info(m, n, 2, 8, 8, 1, 16, 0, 1, 1, 0, 0); + } + } + } + } + } + } + } +} +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h new file mode 100644 index 0000000000..c5e80a7ddc --- /dev/null +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h @@ -0,0 +1,61 @@ +/* + * Copyright (c) 2020-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H +#define ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H + +#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +namespace gemm +{ +/** Valhall based OpenCL GEMMReshapedOnlyRHS configuration */ +class ClGemmDefaultConfigReshapedRhsOnlyValhall final : public IClGemmKernelConfig +{ +public: + /** Constructor + * + * @param[in] gpu GPU target + */ + ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu); + + // Inherited overridden method + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + +private: + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); +}; +} // namespace gemm +} // namespace kernels +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_DEFAULT_CONFIG_RESHAPED_RHS_ONLY_VALHALL_H */ diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h new file mode 100644 
index 0000000000..1503e74eb6
--- /dev/null
+++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h
@@ -0,0 +1,69 @@
+/*
+ * Copyright (c) 2019-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
+#define ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H
+
+#include "src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h"
+#include "src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+namespace gemm
+{
+/** CLGEMMReshapedOnlyRHS factory class */
+class ClGemmReshapedOnlyRhsKernelConfigurationFactory final
+{
+public:
+    /** Static method to call the CLGEMMReshapedOnlyRHS kernel configuration class according to the GPU target
+     *
+     * @param[in] gpu GPU target
+     *
+     * @return CLGEMMReshapedOnlyRHS kernel configuration class
+     */
+    static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu)
+    {
+        switch(get_arch_from_target(gpu))
+        {
+            case GPUTarget::MIDGARD:
+            case GPUTarget::BIFROST:
+                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyBifrost>(gpu);
+            case GPUTarget::VALHALL:
+                return std::make_unique<ClGemmDefaultConfigReshapedRhsOnlyValhall>(gpu);
+            default:
+                ARM_COMPUTE_ERROR("Unsupported GPU target");
+        }
+    }
+};
+} // namespace gemm
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_RESHAPED_ONLY_RHS_KERNEL_CONFIGURATION_H */
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp
new file mode 100644
index 0000000000..6b36cc34b4
--- /dev/null
+++ b/src/gpu/cl/operators/ClActivation.cpp
@@ -0,0 +1,80 @@
+/*
+ * Copyright (c) 2016-2021 Arm Limited.
diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp new file mode 100644 index 0000000000..6b36cc34b4 --- /dev/null +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2016-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClActivation.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" + +#include "src/common/IOperator.h" +#include "src/common/utils/LegacySupport.h" +#include "src/gpu/cl/ClContext.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClActivationKernel>(); + k->configure(compile_context, src, dst, act_info); + _kernel = std::move(k); +} + +Status ClActivation::validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClActivationKernel::validate(src, dst, act_info); +} +} // namespace opencl + +namespace gpu +{ +namespace opencl +{ +std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +{ + TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); + TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); + auto info = detail::convert_to_activation_info(act); + + if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + { + return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); + } + + auto act_op = std::make_unique<arm_compute::opencl::ClActivation>(); + act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); + + auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); + if(op == nullptr) + { + ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); + return std::make_tuple(nullptr, StatusCode::OutOfMemory); + } + op->set_internal_operator(std::move(act_op)); + + return std::make_tuple(op, StatusCode::Success); +} +} // namespace opencl +} // namespace gpu +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h new file mode 100644 index 0000000000..75b38e8a00 --- /dev/null +++ b/src/gpu/cl/operators/ClActivation.h @@ -0,0 +1,56 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ACTIVATION_H +#define ARM_COMPUTE_CL_ACTIVATION_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClActivationKernel */ +class ClActivation : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM16/F16/F32. + * @param[out] dst Destination tensor info. Data type supported: same as @p src + * @param[in] activation_info Activation layer parameters. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClActivation::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ActivationLayerInfo &act_info); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ACTIVATION_H */ diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp new file mode 100644 index 0000000000..e1a013a6b5 --- /dev/null +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClAdd.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); + _kernel = std::move(k); +} + +Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h new file mode 100644 index 0000000000..d99f983ed0 --- /dev/null +++ b/src/gpu/cl/operators/ClAdd.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ADD_H +#define ARM_COMPUTE_CL_ADD_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run arithmetic addition + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @note The function performs an arithmetic addition between two tensors. + */ +class ClAdd : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * Valid configurations (src1,src2) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 First source tensor info. 
Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClAdd::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ADD_H */ diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp new file mode 100644 index 0000000000..8911d208a7 --- /dev/null +++ b/src/gpu/cl/operators/ClCast.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClCast.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClCastKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +{ + auto k = std::make_unique<kernels::ClCastKernel>(); + k->configure(compile_context, src, dst, policy); + _kernel = std::move(k); +} + +Status ClCast::validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy) +{ + return kernels::ClCastKernel::validate(src, dst, policy); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h new file mode 100644 index 0000000000..1b67ff7c8e --- /dev/null +++ b/src/gpu/cl/operators/ClCast.h @@ -0,0 +1,72 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CAST_H +#define ARM_COMPUTE_CL_CAST_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClCastKernel */ +class ClCast : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @note Input data type must be different than output data type. + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src |dst | + * |:--------------|:--------------------------------------| + * |U8 | S8, U16, S16, U32, S32, F16, F32 | + * |U16 | U8, S8, S16, U32, S32, F16, F32 | + * |S16 | U8, S8, U16, U32, S32, F16, F32 | + * |U32 | U8, S8, U16, S16, S32, F16, F32 | + * |S32 | U8, S8, U16, S16, U32, F16, F32 | + * |F16 | U8, S8, U16, S16, U32, F32 | + * |F32 | U8, S8, U16, S16, U32, F16 | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The source tensor to convert. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. + * @param[in] policy Conversion policy.
+ */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCast::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, ConvertPolicy policy); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CAST_H */ diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp new file mode 100644 index 0000000000..731d9b5054 --- /dev/null +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -0,0 +1,247 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClConcatenate.h" + +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" +#include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" +#include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" +#include "src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h" +#include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" +#include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "src/core/helpers/AutoConfiguration.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_ERROR_ON(dst == nullptr); + _axis = axis; + _num_inputs = src_vector.size(); + + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + std::vector<const ITensorInfo *> const_src_vector(src_vector.size()); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); + + // dst auto initialization if not yet initialized + auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); + ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); + + unsigned int offset = 0; + switch(_axis) + { + case Window::DimX: + { + switch(_num_inputs) + { + case 2: + { + // Configure WidthConcatenate2Tensors kernel + auto kernel = std::make_unique<kernels::ClWidthConcatenate2TensorsKernel>(); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + case 4: + { + // Configure WidthConcatenate4Tensors kernel + auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>(); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + _concat_kernels.emplace_back(std::move(kernel)); + break; + } + default: + { + // Configure generic case WidthConcatenate kernels + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + } + break; + } + case Window::DimY: + { + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + case Window::DimZ: + { + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset += src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + case 3: + { + for(unsigned int i = 0; i < _num_inputs; ++i) + { + auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>(); + kernel->configure(compile_context, src_vector.at(i), offset, dst); + offset +=
src_vector.at(i)->dimension(_axis); + _concat_kernels.emplace_back(std::move(kernel)); + } + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } +} + +Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis) +{ + ARM_COMPUTE_RETURN_ERROR_ON(dst == nullptr); + const unsigned int num_inputs = src_vector.size(); + + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); + + unsigned int offset = 0; + switch(axis) + { + case Window::DimX: + { + switch(num_inputs) + { + case 2: + // Validate WidthConcatenate2Tensors kernels if there are 2 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + break; + case 4: + // Validate WidthConcatenate4Tensors kernels if there are 4 inputs + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + break; + default: + // Validate generic case of WidthConcatenate kernel + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + break; + } + case Window::DimY: + { + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + case Window::DimZ: + { + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + case 3: + { + for(const auto &src : src_vector) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); + offset += src->dimension(axis); + } + break; + } + default: + ARM_COMPUTE_ERROR("Axis not supported"); + } + + if(dst->total_size() != 0) + { + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); + ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); + } + + return Status{}; +} + +void ClConcatenate::run(ITensorPack &tensors) +{ + if(tensors.empty()) + { + ARM_COMPUTE_ERROR("No inputs provided"); + } + + if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) + { + ARM_COMPUTE_ERROR("Configured with different number of inputs"); + } + + if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + { + ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); + CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); + } + else + { + int i = 0; + for(auto &k : _concat_kernels) + { + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(ACL_DST)); + CLScheduler::get().enqueue_op(*k, pack, true); + ++i; + } + } +} +} // namespace opencl +} // namespace arm_compute
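A usage note on run() above: callers bind the sources at consecutive ACL_SRC_VEC slots and the output at ACL_DST. A minimal sketch, assuming `concat` is an already configured ClConcatenate and `src0`, `src1`, `dst` are allocated CL tensors (all names here are hypothetical):

// Hypothetical pack for a two-tensor width concatenation (axis == Window::DimX).
ITensorPack pack;
pack.add_tensor(TensorType::ACL_SRC_VEC, &src0);     // source at index 0
pack.add_tensor(TensorType::ACL_SRC_VEC + 1, &src1); // source at index 1
pack.add_tensor(TensorType::ACL_DST, &dst);
concat.run(pack);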
diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h new file mode 100644 index 0000000000..de0cf84d2c --- /dev/null +++ b/src/gpu/cl/operators/ClConcatenate.h @@ -0,0 +1,79 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLCONCATENATE_H +#define ARM_COMPUTE_CLCONCATENATE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +#include <vector> + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to concatenate tensors along a given axis. This function calls the following kernels: + * + * -# @ref kernels::ClWidthConcatenateKernel (if underlying concatenation axis is 0). + * -# @ref kernels::ClHeightConcatenateKernel (if underlying concatenation axis is 1). + * -# @ref kernels::ClDepthConcatenateKernel (if underlying concatenation axis is 2). + * -# @ref kernels::ClBatchConcatenateKernel (if underlying concatenation axis is 3). + */ +class ClConcatenate : public IClOperator +{ +public: + ClConcatenate() = default; + /** Initialise the kernel's inputs vector and dst. + * + * @note Input and dst tensor dimensions preconditions differ depending on the concatenation axis. + * @note Preconditions can be found respectively at @ref kernels::ClWidthConcatenateKernel, + * @ref kernels::ClHeightConcatenateKernel and @ref kernels::ClDepthConcatenateKernel. + * + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] src_vector The vector containing all the tensor infos to concatenate. Data types supported: All + * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. + * @param[in] axis Concatenation axis. Supported underlying concatenation axes are 0, 1, 2 and 3.
+ */ + void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClConcatenate::configure() + * + * @return a status + */ + static Status validate(const std::vector<const ITensorInfo *> &src_vector, const ITensorInfo *dst, size_t axis); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + +private: + std::vector<std::unique_ptr<IClKernel>> _concat_kernels{}; + unsigned int _num_inputs{ 0 }; + unsigned int _axis{ 0 }; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLCONCATENATE_H */ diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp new file mode 100644 index 0000000000..c91a4831a8 --- /dev/null +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -0,0 +1,292 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClConv2d.h" + +#include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" +#include "src/gpu/cl/operators/ClDirectConv2d.h" +#include "src/gpu/cl/operators/ClGemmConv2d.h" +#include "src/gpu/cl/operators/ClWinogradConv2d.h" + +#include <memory> + +namespace +{ +/** Get the suitable kernel size for using direct convolution method with NHWC data layout.
+ * + * @note Direct convolution should be executed when the kernel's spatial dimensions are greater than or equal to the value returned by this function + * + * @param[in] gpu_target GPU target + * + * @return the suitable kernel size for using direct convolution method with NHWC data layout + */ +size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) +{ + switch(gpu_target) + { + case arm_compute::GPUTarget::G76: + case arm_compute::GPUTarget::G77: + case arm_compute::GPUTarget::G78: + return 5; + case arm_compute::GPUTarget::G71: + case arm_compute::GPUTarget::G72: + case arm_compute::GPUTarget::MIDGARD: + case arm_compute::GPUTarget::BIFROST: + return 7; + default: + return 5; + } +} +} // namespace + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::misc::shape_calculator; + +ClConv2d::ClConv2d() + : _operator() +{ +} + +ClConv2d::~ClConv2d() = default; + +void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); + + switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + { + case ConvolutionMethod::WINOGRAD: + { + ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); + auto f = std::make_unique<ClWinogradConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); + _operator = std::move(f); + break; + } + case ConvolutionMethod::DIRECT: + { + ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); + auto f = std::make_unique<ClDirectConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info); + _operator = std::move(f); + break; + } + case ConvolutionMethod::GEMM: + { + auto f = std::make_unique<ClGemmConv2d>(); + f->configure(compile_context, src, weights, biases, dst, conv2d_info, weights_info); + _operator = std::move(f); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + _aux_mem = _operator->workspace(); +} + +Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + + const GPUTarget gpu_target = CLScheduler::get().target(); + + switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + { + case ConvolutionMethod::WINOGRAD: + { + // Validate Winograd + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math)); + break; + } + case ConvolutionMethod::DIRECT: + { + // Validate direct convolution layer + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); +
ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + break; + } + case ConvolutionMethod::GEMM: + { + // Validate gemm-based convolution layer + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); + break; + } + default: + ARM_COMPUTE_ERROR("Not supported."); + break; + } + + return Status{}; +} + +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, const GPUTarget gpu_target) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + ARM_COMPUTE_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_ERROR_ON_NULLPTR(weights); + ARM_COMPUTE_UNUSED(weights_info); + + const PadStrideInfo conv_info = conv2d_info.conv_info; + const ActivationLayerInfo act_info = conv2d_info.act_info; + const Size2D dilation = conv2d_info.dilation; + bool enable_fast_math = conv2d_info.enable_fast_math; + + const size_t idx_w = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_h = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + const size_t idx_c = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); + + /* Input spatial dims, kernel size, IFM/OFM, conv info*/ + using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; + using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; + + const std::vector<ConfigurationMethod> known_configs = + { + // Alexnet + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + // VGG16 / VGG19 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + // Mobilenet 224 + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + // Mobilenet 160 + ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + }; + + const auto find_config = [&](ConfigurationMethod c) + { + const ConvolutionConfiguration config = c.first; + const PadStrideInfo info = std::get<3>(config); + const DataLayout data_layout = std::get<4>(config); + + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) + && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() + && 
info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + }; + + std::vector<ConfigurationMethod>::const_iterator found; + if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + { + return (*found).second; + } + + if(dilation != Size2D(1U, 1U)) + { + return ConvolutionMethod::GEMM; + } + else + { + if(src->data_layout() == DataLayout::NCHW) + { + // SRGAN + if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) + && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + { + return ConvolutionMethod::DIRECT; + } + if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + { + return ConvolutionMethod::FFT; + } + if(src->dimension(idx_c) < 16) + { + return ConvolutionMethod::GEMM; + } + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + } + else + { + const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); + + // SRGAN case + if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) + && is_direct_valid) + { + return ConvolutionMethod::DIRECT; + } + + // Floating-point case: GeMM/Direct/Winograd + if(is_data_type_float(src->data_type())) + { + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + + // Run Winograd if valid and IFM >= 16 + if(is_wino_valid && is_ifm_ge_16) + { + return ConvolutionMethod::WINOGRAD; + } + // Run Direct for Large kernel size + if(is_large_kernel_sz && is_ifm_ge_16 && is_direct_valid && is_ifm_gt_ofm) + { + return ConvolutionMethod::DIRECT; + } + + // Default case + return ConvolutionMethod::GEMM; + } + + // Generic case for quantized. Only GeMM + return ConvolutionMethod::GEMM; + } + } +} + +void ClConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + _operator->run(tensors); +} + +void ClConv2d::prepare(ITensorPack &tensors) +{ + _operator->prepare(tensors); +} + +experimental::MemoryRequirements ClConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h new file mode 100644 index 0000000000..1c3a81c77a --- /dev/null +++ b/src/gpu/cl/operators/ClConv2d.h @@ -0,0 +1,152 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CLCONV2D_H +#define ARM_COMPUTE_CLCONV2D_H + +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/FunctionDescriptors.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions: + * + * -# @ref opencl::ClGemmConv2d + * -# @ref opencl::ClWinogradConv2d + * -# @ref opencl::ClDirectConv2d + * -# @ref CLFFTConvolutionLayer + * + * The function selects one of the algorithms mentioned above based on: + * - The size of the kernel + * - Number of src/dst feature maps + * - Amount of memory needed + * + * Generally GEMM-based convolution is executed when neither Winograd nor FFT nor Direct convolution can be performed. + * + * FP32 Algorithm| Filter Size | Input/Output feature maps | + * --------------|-------------------------------------------------------------|-------------------------------------------| + * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5(fast maths) 7x1 1x7 | Input channels is greater than 3 | + * FFT | Squared kernels and greater than 9x9 | Input feature maps > Output feature maps | + * DirectConv | 9x9 | | + * GEMM | Any size | | + * + * Winograd 5x5 requires fast maths enabled. + * + * FP16 Algorithm| Filter Size | Input/Output feature maps | + * --------------|----------------------------|-------------------------------------------| + * Winograd | 3x3 1x3 3x1 5x1 1x5 5x5 | Input channels is greater than 3 | + * FFT | Not supported | | + * DirectConv | 9x9 | | + * GEMM | Any size | | + * + * Winograd FP16 requires fast maths enabled. + * + */ +class ClConv2d : public IClOperator +{ +public: + /** Default constructor */ + ClConv2d(); + /** Default Destructor */ + ~ClConv2d(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClConv2d(const ClConv2d &) = delete; + /** Default move constructor */ + ClConv2d(ClConv2d &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClConv2d &operator=(const ClConv2d &) = delete; + /** Default move assignment operator */ + ClConv2d &operator=(ClConv2d &&) = default; + /** Set the src and dst tensors. 
+ * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of srcs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED. + * @param[in] biases Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. + * Data type supported: Same as @p src, except for src of QASYMM8/QASYMM8_SIGNED type where biases should be of S32 type. + * @param[out] dst Destination tensor info. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. + * Data types supported: Same as @p src. + * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d + * + * Similar to ClConv2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); + /** Static function to check if given info will return the convolution called by @ref ClConv2d + * + * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM], + * while every optional dimension from 4 and above represent a batch of srcs. + * Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. + * Data type supported: Same as @p src, also could be QSYMM8_PER_CHANNEL if src is QASYMM8/QASYMM8_SIGNED. + * @param[in] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent batch of dsts. + * Data types supported: Same as @p src. + * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. + * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. + * @param[in] gpu_target Specifies the @p GPUTarget. 
+ * + * @return the Convolution Method Hint + */ + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, const GPUTarget gpu_target); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + std::unique_ptr<IClOperator> _operator; + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CLCONV2D_H */ diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp new file mode 100644 index 0000000000..61e33f2fdb --- /dev/null +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +{ + auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>(); + k->configure(compile_context, src, dst, original_src_shape, data_layout); + _kernel = std::move(k); +} + +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +{ + return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); +} +} // namespace opencl +} // namespace arm_compute
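To make the selection heuristic shown earlier in ClConv2d.cpp more concrete, here is a hedged sketch of querying get_convolution_method() before configuring; every shape and descriptor below is an assumption made for illustration:

// Assumed NHWC F32 convolution: 56x56x64 input, 3x3 kernel with 64 OFMs, stride 1, pad 1.
TensorInfo src(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
TensorInfo weights(TensorShape(64U, 3U, 3U, 64U), 1, DataType::F32);
TensorInfo dst(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32);
src.set_data_layout(DataLayout::NHWC);
weights.set_data_layout(DataLayout::NHWC);
dst.set_data_layout(DataLayout::NHWC);
const Conv2dInfo conv2d_info(PadStrideInfo(1, 1, 1, 1), Size2D(1U, 1U), ActivationLayerInfo(), false /* enable_fast_math */, 1 /* num_groups */);
const ConvolutionMethod method = opencl::ClConv2d::get_convolution_method(&src, &weights, &dst, conv2d_info, WeightsInfo(), CLScheduler::get().target());
// With IFM >= 16 and a valid Winograd configuration, the heuristic would typically pick WINOGRAD here.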
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h new file mode 100644 index 0000000000..2794eb17b0 --- /dev/null +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H +#define ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClConvertFullyConnectedWeightsKernel */ +class ClConvertFullyConnectedWeights : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[out] dst The dst tensor info. Data types supported: Same as @p src + * @param[in] original_src_shape Shape of the original src tensor (the one entering the fully connected layer). + * @param[in] data_layout The data layout the weights have been trained in. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClConvertFullyConnectedWeights::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CONVERT_FULLYCONNECTED_WEIGHTS_H */ diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp new file mode 100644 index 0000000000..c1a9f264b6 --- /dev/null +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClCopy.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClCopyKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClCopy::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window) +{ + auto k = std::make_unique<kernels::ClCopyKernel>(); + k->configure(compile_context, src, dst, dst_window); + _kernel = std::move(k); +} + +Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window) +{ + return kernels::ClCopyKernel::validate(src, dst, dst_window); +} +} // namespace opencl +} // namespace arm_compute
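For completeness, a brief sketch of the validate-then-configure-then-run lifecycle these stateless operators share, using ClCopy; the tensor infos and the CL tensors `src`/`dst` bound into the pack are assumptions:

// Hypothetical driver code; assumes the CL backend and the tensors exist elsewhere.
TensorInfo src_info(TensorShape(8U, 8U), 1, DataType::F32);
TensorInfo dst_info(TensorShape(8U, 8U), 1, DataType::F32);
opencl::ClCopy copy;
if(bool(opencl::ClCopy::validate(&src_info, &dst_info)))
{
    copy.configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info);
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC, &src);
    pack.add_tensor(TensorType::ACL_DST, &dst);
    copy.run(pack);
}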
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h new file mode 100644 index 0000000000..9b427f9675 --- /dev/null +++ b/src/gpu/cl/operators/ClCopy.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_COPY_H +#define ARM_COMPUTE_CL_COPY_H + +#include "arm_compute/core/Window.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClCopyKernel */ +class ClCopy : public IClOperator +{ +public: + /** Initialise the function's source and destination. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: All. + * @param[out] dst Output tensor info. Data types supported: Same as @p src. + * @param[in] dst_window (Optional) Window to be used when copying into only part of a tensor. Default is nullptr. + * + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCopy::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Window *dst_window = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_COPY_H */ diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp new file mode 100644 index 0000000000..a6a1c8b103 --- /dev/null +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -0,0 +1,46 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClCrop.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClCropKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, + Window *dst_window) +{ + auto k = std::make_unique<kernels::ClCropKernel>(); + k->configure(compile_context, src, dst, start, end, batch_index, extrapolation_value, dst_window); + _kernel = std::move(k); +} + +Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +{ + return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); +} +} // namespace opencl +} // namespace arm_compute
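Usage note (editorial sketch, not part of the patch): ClCrop wraps its kernel the same way but carries more configuration state: an (x, y) start and end corner, a batch index that selects which 3D image of the 4D input to crop, and an extrapolation value for reads that fall outside the image. Because validate() is static, support can be probed before any OpenCL resources exist. The shapes below are hypothetical, and whether an uninitialised dst is accepted depends on ClCropKernel's auto-initialisation rules, so treat this strictly as a sketch.

#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/operators/ClCrop.h"

using namespace arm_compute;

// Hypothetical probe: would cropping a 224x224 NHWC F32 image down to 112x112 be accepted?
bool crop_is_supported()
{
    TensorInfo src(TensorShape(3U, 224U, 224U, 8U), 1, DataType::F32); // [C, W, H, N]
    src.set_data_layout(DataLayout::NHWC);
    TensorInfo dst; // left empty; the kernel derives and checks the output shape

    const Status st = opencl::ClCrop::validate(&src, &dst, Coordinates2D{ 16, 16 }, Coordinates2D{ 127, 127 },
                                               /*batch_index=*/0, /*extrapolation_value=*/0.f);
    return bool(st); // a Status converts to true when the configuration is valid
}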
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h new file mode 100644 index 0000000000..1cf1c9bff4 --- /dev/null +++ b/src/gpu/cl/operators/ClCrop.h @@ -0,0 +1,65 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_CROP_H +#define ARM_COMPUTE_CL_CROP_H + +#include "arm_compute/core/Window.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClCropKernel */ +class ClCrop : public IClOperator +{ +public: + /** Initialise the function's source and destination. + * + * @note Supported tensor rank: up to 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data type supported: All. Data layouts supported: NHWC. + * @param[out] dst Destination tensor info. Data type supported: F32 + * @param[in] start Coordinates of where to start cropping the image. + * @param[in] end Coordinates of where to end cropping the image. + * @param[in] batch_index Fourth dimension index of the 3D image to crop in @p src. + * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. + * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, + Window *dst_window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClCrop::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, + Window *dst_window = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_CROP_H */ diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp new file mode 100644 index 0000000000..dbaa5f67df --- /dev/null +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClDequantize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClDequantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClDequantizeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClDequantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClDequantizeKernel::validate(src, dst); +} + +void ClDequantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + CLScheduler::get().enqueue_op(*_kernel.get(), tensors); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.h b/src/gpu/cl/operators/ClDequantize.h new file mode 100644 index 0000000000..ccaac2cd49 --- /dev/null +++ b/src/gpu/cl/operators/ClDequantize.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_DEQUANTIZE_H +#define ARM_COMPUTE_CL_DEQUANTIZE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClDequantizeKernel that dequantizes an input tensor */ +class ClDequantize : public IClOperator +{ +public: + /** Set the input and output tensors. + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/QSYMM8_PER_CHANNEL/QSYMM8/QSYMM16. + * @param[out] dst Destination tensor info with the same dimensions of @p src. Data type supported: F16/F32. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClDequantize::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); + + // Inherited method overridden + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_DEQUANTIZE_H */ diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp new file mode 100644 index 0000000000..50e63beedc --- /dev/null +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -0,0 +1,106 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClDirectConv2d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" +#include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace +{ +ITensorPack select_activation_src_dst(ITensorPack &tensors) +{ + ITensorPack pack; + pack.add_tensor(TensorType::ACL_SRC, tensors.get_tensor(TensorType::ACL_DST)); + pack.add_tensor(TensorType::ACL_DST, tensors.get_tensor(TensorType::ACL_DST)); + return pack; +} +} // namespace + +void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + + // Configure direct convolution kernel + const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); + auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info); + _direct_conv_kernel = std::move(k); + + // Configure border handler + PixelValue zero_value(0.f); + if(is_data_type_quantized_asymmetric(src->data_type())) + { + zero_value = PixelValue(0, src->data_type(), src->quantization_info()); + } + auto b = std::make_unique<CLFillBorderKernel>(); + b->configure(compile_context, src, _direct_conv_kernel->border_size(), BorderMode::CONSTANT, zero_value); + _src_border_handler = std::move(b); + + // Fused activation is currently supported for NHWC and floating point types + if(act_info.enabled() && !conv2d_act_info.enabled()) + { + auto a = std::make_unique<kernels::ClActivationKernel>(); + a->configure(compile_context, dst, dst, act_info); + _activation_kernel = std::move(a); + } + + // Tune kernels + CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); +} + +Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +{ + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), CLScheduler::get().target())); + if(act_info.enabled()) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); + } + return Status{}; +} + +void ClDirectConv2d::run(ITensorPack &tensors) +{ + // Run border handler + CLScheduler::get().enqueue_op(*_src_border_handler.get(), tensors, false); + // Run direct convolution + CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); + // Run activation kernel + if(_activation_kernel) + { + auto act_pack = select_activation_src_dst(tensors); + CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); + } +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h new file mode 100644 index 0000000000..85365b76ff --- /dev/null +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -0,0 +1,82 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_DIRECT_CONV2D_H
+#define ARM_COMPUTE_CL_DIRECT_CONV2D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute a direct convolution layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref opencl::kernels::ClDirectConv2dKernel
+ */
+class ClDirectConv2d : public IClOperator
+{
+public:
+ ClDirectConv2d() = default;
+ /** Set the src and dst tensors.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor. 3 lower dimensions represent a single src [width, height, IFM],
+ * while every optional dimension from 4 and above represents a batch of srcs.
+ * Data types supported: QASYMM8_SIGNED/QASYMM8/F16/F32.
+ * @param[in] weights Weights tensor. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+ * @param[in] biases Biases tensor. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+ * Data type supported: Should match @p src data type, except for src of QASYMM8 and QASYMM8_SIGNED type where biases should be of S32 type.
+ * @param[out] dst Destination tensor. 3 lower dimensions represent a single dst [width, height, OFM], while the rest represent a batch of dsts.
+ * Data types supported: Same as @p src.
+ * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ *
+ */
+ void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClDirectConv2d::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+ const ActivationLayerInfo &act_info = ActivationLayerInfo());
+
+ // Inherited method overridden
+ void run(ITensorPack &tensors) override;
+
+private:
+ std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr };
+ std::unique_ptr<IClKernel> _src_border_handler{ nullptr };
+ std::unique_ptr<IClKernel> _activation_kernel{ nullptr };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_DIRECT_CONV2D_H */
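Usage note (editorial sketch, not part of the patch): ClDirectConv2d above is the first multi-kernel operator in this series. Its run() enqueues the border handler, the convolution kernel and, when the activation could not be fused into the convolution, a standalone activation whose source and destination are both the convolution output, which is exactly what select_activation_src_dst() in the .cpp builds. Below is a sketch of driving it with an ITensorPack; the ACL_SRC_0/1/2 slot assignment is the convention used by the library's multi-input operator kernels and is assumed here, as is the helper name.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClDirectConv2d.h"

using namespace arm_compute;

// Hypothetical run helper for an already-configured ClDirectConv2d
void run_direct_conv(opencl::ClDirectConv2d &conv, CLTensor &src, CLTensor &weights, CLTensor &biases, CLTensor &dst)
{
    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_0, &src);     // input feature map
    pack.add_tensor(TensorType::ACL_SRC_1, &weights); // 4D weights [kernel_x, kernel_y, IFM, OFM]
    pack.add_tensor(TensorType::ACL_SRC_2, &biases);  // 1D biases [OFM]
    pack.add_tensor(TensorType::ACL_DST, &dst);
    conv.run(pack); // enqueues border handling, convolution and, if configured, the activation
}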
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp new file mode 100644 index 0000000000..4e4cd5ae9d --- /dev/null +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -0,0 +1,92 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClElementwiseOperations.h" + +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::DIV, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); +} + +void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::MAX, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); +} + +void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::MIN, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, act_info); +} + +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, 
ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info);
+}
+
+void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ auto k = std::make_unique<kernels::ClArithmeticKernel>();
+ k->configure(compile_context, ArithmeticOperation::POWER, src1, src2, dst, act_info);
+ _kernel = std::move(k);
+}
+
+Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info)
+{
+ return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h
new file mode 100644
index 0000000000..304b250d66
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseOperations.h
@@ -0,0 +1,165 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+#define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for division
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an arithmetic division between two tensors.
+ */
+class ClElementwiseDivision : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseDivision::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a max operation between two tensors.
+ */
+class ClElementwiseMax : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseMax::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min
+ *
+ * @note The tensor data type for the inputs must be U8/QASYMM8/S16/QSYMM16/S32/U32/F16/F32.
+ * @note The function performs a min operation between two tensors.
+ */
+class ClElementwiseMin : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/U32/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseMin::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference
+ *
+ * @note The tensor data type for the inputs must be QASYMM8/U8/S16/QSYMM16/F16/F32.
+ * @note The function performs a squared difference operation between two tensors (i.e., out[i] = (in1[i] - in2[i])^2)
+ */
+class ClElementwiseSquaredDiff : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: same as @p src1.
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src1.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwiseSquaredDiff::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+
+/** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power
+ *
+ * @note The tensor data type for the inputs must be F16/F32.
+ * @note The function performs an elementwise power of in1 to in2 (i.e., out[i] = in1[i] ^ in2[i])
+ */
+class ClElementwisePower : public IClOperator
+{
+public:
+ /** Configure function for a given list of arguments.
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src1 First source tensor info. Data types supported: F16/F32.
+ * @param[in] src2 Second source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data types supported: F16/F32.
+ * @param[in] act_info (Optional) Activation layer information in case of a fused activation.
+ */
+ void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClElementwisePower::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo());
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H */
diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp
new file mode 100644
index 0000000000..24a603e8c3
--- /dev/null
+++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp
@@ -0,0 +1,116 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClElementwiseUnary.h" + +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClRsqrt::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::RSQRT); + _kernel = std::move(k); +} + +Status ClRsqrt::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::RSQRT); +} + +void ClExp::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::EXP); + _kernel = std::move(k); +} + +Status ClExp::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::EXP); +} + +void ClNeg::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::NEG); + _kernel = std::move(k); +} + +Status ClNeg::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::NEG); +} + +void ClSin::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::SIN); + _kernel = std::move(k); +} + +Status ClSin::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::SIN); +} + +void ClAbs::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::ABS); + _kernel = std::move(k); +} + +Status ClAbs::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ABS); +} + +void ClLog::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::LOG); + _kernel = std::move(k); +} + +Status ClLog::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOG); +} + +void ClRound::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::ROUND); + 
_kernel = std::move(k); +} + +Status ClRound::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::ROUND); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.h b/src/gpu/cl/operators/ClElementwiseUnary.h new file mode 100644 index 0000000000..a23b789ab5 --- /dev/null +++ b/src/gpu/cl/operators/ClElementwiseUnary.h @@ -0,0 +1,175 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H +#define ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to perform inverse square root on an src tensor. */ +class ClRsqrt : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClRsqrt::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to perform exponential on an src tensor. */ +class ClExp : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClExp::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to negate an src tensor. */ +class ClNeg : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. 
+ * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClNeg::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to calculate sine of an src tensor. */ +class ClSin : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClSin::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to perform elementwise log on an src tensor. */ +class ClLog : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClLog::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to get the absolute value of an src tensor. */ +class ClAbs : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClAbs::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; + +/** Basic function to get the round (to the nearest even) value of an src tensor. */ +class ClRound : public IClOperator +{ +public: + /** Initialize the function + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: F16/F32. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. 
+ */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClRound::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_ELEMENTWISE_UNARY_H */ diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp new file mode 100644 index 0000000000..9e006c1649 --- /dev/null +++ b/src/gpu/cl/operators/ClFill.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClFill.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClFillKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +{ + auto k = std::make_unique<kernels::ClFillKernel>(); + k->configure(compile_context, tensor, constant_value, dst_window); + _kernel = std::move(k); +} + +Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +{ + return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); +} +} // namespace opencl +} // namespace arm_compute
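Usage note (editorial sketch, not part of the patch): ClFill is the in-place member of this family. configure() takes a single tensor info plus the fill value, and the header that follows documents that tensor as [in,out]. A minimal sketch, with the caveat that the ACL_SRC_DST pack slot is assumed to be what the in-place fill kernel reads, and the helper name is hypothetical.

#include "arm_compute/core/CL/CLKernelLibrary.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/runtime/CL/CLTensor.h"
#include "src/gpu/cl/operators/ClFill.h"

using namespace arm_compute;

// Hypothetical helper: zero an allocated tensor in place
void fill_with_zero(CLTensor &tensor)
{
    opencl::ClFill fill;
    fill.configure(CLKernelLibrary::get().get_compile_context(), tensor.info(), PixelValue(0.f));

    ITensorPack pack;
    pack.add_tensor(TensorType::ACL_SRC_DST, &tensor); // assumed in-place slot
    fill.run(pack);
}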
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h new file mode 100644 index 0000000000..c9289b2b95 --- /dev/null +++ b/src/gpu/cl/operators/ClFill.h @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_FILL_H +#define ARM_COMPUTE_CL_FILL_H + +#include "arm_compute/core/Window.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClFillKernel */ +class ClFill : public IClOperator +{ +public: + /** Initialise the kernel's tensor and filling value + * + * @param[in] compile_context The compile context to be used. + * @param[in,out] tensor Source tensor info. Supported data types: All. + * @param[in] constant_value The value used to fill the planes of the tensor + * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClFill::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FILL_H */ diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp new file mode 100644 index 0000000000..3283454fd6 --- /dev/null +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClFlatten.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/kernels/ClReshapeKernel.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+void ClFlatten::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst)
+{
+ auto k = std::make_unique<kernels::ClReshapeKernel>();
+ k->configure(compile_context, src, dst);
+ _kernel = std::move(k);
+}
+
+Status ClFlatten::validate(const ITensorInfo *src, const ITensorInfo *dst)
+{
+ return kernels::ClReshapeKernel::validate(src, dst);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClFlatten.h b/src/gpu/cl/operators/ClFlatten.h
new file mode 100644
index 0000000000..d2ce3b701d
--- /dev/null
+++ b/src/gpu/cl/operators/ClFlatten.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLATTEN_H
+#define ARM_COMPUTE_CL_FLATTEN_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to flatten a given input */
+class ClFlatten : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * Valid data layouts:
+ * - All
+ *
+ * Valid data type configurations:
+ * |src |dst |
+ * |:--------------|:--------------|
+ * |All |All |
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor to flatten with at least 3 dimensions.
+ * The dimensions above the third will be interpreted as batches. Data types supported: All
+ * @param[out] dst Destination tensor with shape [w*h*d, input_batches] where:
+ * w = width of the input tensor, h = height of the input tensor and d = depth of the input tensor.
+ * Data type supported: same as @p src + */ + void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClFlatten::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FLATTEN_H */ diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp new file mode 100644 index 0000000000..866bff2fad --- /dev/null +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClFloor.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClFloorKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClFloor::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClFloorKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClFloor::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClFloorKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFloor.h b/src/gpu/cl/operators/ClFloor.h new file mode 100644 index 0000000000..746147335e --- /dev/null +++ b/src/gpu/cl/operators/ClFloor.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_FLOOR_H
+#define ARM_COMPUTE_CL_FLOOR_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClFloorKernel */
+class ClFloor : public IClOperator
+{
+public:
+ /** Configure operator for a given list of arguments
+ *
+ * @param[in] compile_context The compile context to be used.
+ * @param[in] src Source tensor info. Data types supported: F16/F32.
+ * @param[out] dst Destination tensor info. Data type supported: same as @p src
+ */
+ void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst);
+ /** Static function to check if given info will lead to a valid configuration
+ *
+ * Similar to @ref ClFloor::configure()
+ *
+ * @return a status
+ */
+ static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_FLOOR_H */
diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp
new file mode 100644
index 0000000000..8b7e336c9f
--- /dev/null
+++ b/src/gpu/cl/operators/ClFullyConnected.cpp
@@ -0,0 +1,496 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */ +#include "src/gpu/cl/operators/ClFullyConnected.h" + +#include "arm_compute/core/Size2D.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" + +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/gpu/cl/operators/ClFlatten.h" +#include "src/gpu/cl/operators/ClGemm.h" +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" +#include "src/gpu/cl/operators/ClTranspose.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" + +#include "support/Cast.h" + +#include <algorithm> + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::experimental; +using namespace arm_compute::misc::shape_calculator; + +namespace +{ +Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +{ + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + gemmlowp_output_stage.gemmlowp_multiplier = 0; + gemmlowp_output_stage.gemmlowp_shift = 0; + + const auto data_type = src.data_type(); + + // Configure output stage for quantized case + if(is_data_type_quantized_asymmetric(data_type)) + { + const QuantizationInfo oq_info = dst.quantization_info(); + const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); + const UniformQuantizationInfo wq_unif = weights.quantization_info().uniform(); + const UniformQuantizationInfo oq_unif = oq_info.uniform(); + + const auto output_quant_info = (dst.total_size() == 0) ? 
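+ // When dst has not been initialised yet we fall back to the input quantization info. + // The requantization factor computed below is multiplier = (scale_src * scale_wei) / scale_dst, + // which calculate_quantized_multiplier() decomposes into an integer multiplier and shift.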
iq_unif : oq_unif; + + const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; + int output_multiplier = 0; + int output_shift = 0; + ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + + PixelValue type_min{}; + PixelValue type_max{}; + std::tie(type_min, type_max) = get_min_max(data_type); + + if(activation_info.enabled()) + { + std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_multiplier = output_multiplier; + gemmlowp_output_stage.gemmlowp_shift = output_shift; + gemmlowp_output_stage.gemmlowp_multipliers.push_back(output_multiplier); + gemmlowp_output_stage.gemmlowp_shifts.push_back(output_shift); + type_min.get(gemmlowp_output_stage.gemmlowp_min_bound); + type_max.get(gemmlowp_output_stage.gemmlowp_max_bound); + } + + return Status{}; +} + +Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + ActivationLayerInfo()); // activation_info + + if(is_data_type_quantized_asymmetric(src.data_type())) + { + const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); + const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); + + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate src and weights offset + const QuantizationInfo src_quantization_info(iq_info.scale, -iq_info.offset); + const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); + + // Validate gemmlowp function + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), + bias, + &dst, + gemm_info)); + } + else + { + ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&src, &weights, bias, &dst, 1.f, 1.f, gemm_info)); + } + + return Status{}; +} +} // namespace + +ClFullyConnected::ClFullyConnected() + : _convert_weights(nullptr), + _flatten(nullptr), + _reshape_weights(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _aux_mem(Count) +{ +} + +ClFullyConnected::~ClFullyConnected() = default; + +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + GEMMLowpOutputStageInfo gemmlowp_output_stage; + construct_gemmlowp_output_stage(*src, *weights, *dst, gemmlowp_output_stage, fc_info.activation_info); + + const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped + false, // is_b_reshaped + true, // reshape_b_only_on_first_run + 0, // 
depth_output_gemm3d + false, // reinterpret_input_as_3d + fc_info.retain_internal_weights, // retain_internal_weights + gemmlowp_output_stage, // gemmlowp_output_stage + fc_info.fp_mixed_precision, // fp_mixed_precision + false, // fast_math + true, // broadcast_bias + fc_info.activation_info, // activation_info + fc_info.constant_weights); // constant_weights + + if(_is_quantized) + { + // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() + // Extract and negate input and weights offset + const QuantizationInfo src_quantization_info = src->quantization_info(); + const QuantizationInfo weights_quantization_info = weights->quantization_info(); + + TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); + TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); + + src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + + // Configure gemmlowp function + _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); + _mm_gemmlowp->configure(compile_context, &src_info, &weights_info, bias, dst, gemm_info); + } + else + { + // Configure matrix multiply kernel + _mm_gemm = std::make_unique<ClGemm>(); + _mm_gemm->configure(compile_context, src, weights, bias, dst, 1.f, 1.f, gemm_info); + } +} + +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON((weights->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // If the fully connected layer is called after a convolution layer, the input tensor must be linearized + + // Initialize output tensor for flatten + _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + + // Configure flatten kernel + _flatten = std::make_unique<ClFlatten>(); + _flatten->configure(compile_context, src, &_flattened_src); + + // Configure matrix multiply kernel + configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); +} + +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info) +{ + ARM_COMPUTE_ERROR_ON(src->dimension(0) != weights->dimension(1)); + + // Configure matrix multiply kernel + configure_mm(compile_context, src, weights, bias, dst, fc_info); +} + +void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); + + // Perform validate step + ARM_COMPUTE_ERROR_THROW_ON(ClFullyConnected::validate(src, weights, biases, dst, fc_info)); + + _are_weights_converted = true; + _are_weights_reshaped = fc_info.transpose_weights ? 
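+ // If no transpose is requested the weights are treated as already reshaped; + // otherwise the caller-provided flag decides whether the transpose still has to run.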
fc_info.are_weights_reshaped : true; + _is_fc_after_conv = true; + _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); + _is_prepared = fc_info.retain_internal_weights; + _weights_to_use = TensorInfo(*weights); + _weights_to_use_idx = ACL_SRC_1; + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + if(is_batched_fc_layer) + { + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, + src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + _is_fc_after_conv = src->num_dimensions() > 1; + } + + ITensorInfo *weights_used = weights; + + // Reshape weights if needed + if(!_are_weights_reshaped) + { + // Reshape the weights + _reshape_weights = std::make_unique<ClTranspose>(); + _reshape_weights->configure(compile_context, weights, &_reshaped_weights); + weights_used = &_reshaped_weights; + _weights_to_use_idx = offset_int_vec(TransposedWeights); + } + + // Convert weights if needed + if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Convert weights + _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>(); + _convert_weights->configure(compile_context, + weights_used, + &_converted_weights, + src->tensor_shape(), + fc_info.weights_trained_layout); + + weights_used = &_converted_weights; + _weights_to_use_idx = offset_int_vec(ConvertedWeights); + _are_weights_converted = false; + } + + if(_is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + configure_fc_fc(compile_context, src, weights_used, biases, dst, fc_info); + } + // Update TensorInfo of final weights used (needs to be done at the end due to padding expansion) + _weights_to_use = *weights_used; + + // Set auxiliary memory requirements + auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); + for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + { + _aux_mem[i] = gemm_mem_req[i]; + } + if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), MemoryLifetime::Prepare, _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Prepare, _converted_weights.total_size()); + } + else + { + // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ?
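+ // Only the weights tensor actually consumed by the GEMM must persist across runs; + // the other intermediate is only required while prepare() executes.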
MemoryLifetime::Persistent : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), transposed_wei_lft, _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), converted_wei_lft, _converted_weights.total_size()); + } + _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); +} + +Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); + ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU + && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON(!fc_info.constant_weights && (!fc_info.are_weights_reshaped || fc_info.transpose_weights)); + + bool weights_reshaped = fc_info.transpose_weights ? fc_info.are_weights_reshaped : true; + bool is_fc_after_conv = true; + + const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = weights_reshaped ? 
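+ // The convert stage consumes transposed weights, so when a transpose is still pending + // its input info is derived from reshaped_weights rather than from the raw weights.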
TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()) : TensorInfo(*reshaped_weights.clone()); + + // With the Fully Connected layer we can have 4 different cases: + // 1) Convolution layer -> Fully Connected layer without batches + // 2) Fully Connected layer -> Fully Connected layer without batches + // 3) Convolution layer -> Fully Connected layer with batches + // 4) Fully Connected layer -> Fully Connected layer with batches + + const ITensorInfo *src_to_use = src; + const ITensorInfo *weights_to_use = weights; + + // Check if we have a fully connected layer with batches + const bool is_batched_fc_layer = dst->dimension(1) > 1; + if(is_batched_fc_layer) + { + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, + src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); + } + else + { + is_fc_after_conv = src->num_dimensions() > 1; + } + + if(!weights_reshaped) + { + // Validate reshape weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); + weights_to_use = &reshaped_weights; + } + + if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + { + // Validate convert weights kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, + &converted_weights, + src->tensor_shape(), + fc_info.weights_trained_layout)); + weights_to_use = &converted_weights; + } + + if(is_fc_after_conv) + { + // Fully Connected layer after a Convolution Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + + // Validate flatten kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); + src_to_use = &flatten_src; + } + else + { + // Fully Connected layer after a Fully Connected Layer without batches + ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != weights_to_use->dimension(1)); + } + + // Validate matrix multiply kernel + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(*src_to_use, *weights_to_use, biases, *dst, fc_info)); + + return Status{}; +} + +void ClFullyConnected::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto src = tensors.get_const_tensor(ACL_SRC_0); + + CLAuxTensorHandler flattened_src(offset_int_vec(FlattenedSrc), _flattened_src, tensors, false); + CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); + + // Linearize input if it comes from a convolutional layer + if(_is_fc_after_conv) + { + ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + _flatten->run(flatten_pack); + } + + ITensorPack gemm_pack = tensors; + gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? 
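+ // Feed the GEMM the linearized tensor when the input comes from a convolutional layer, + // the raw input tensor otherwise.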
flattened_src.get() : src); + if(_weights_to_use_idx != ACL_SRC_1) + { + gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); + } + + // Run matrix multiply + if(_is_quantized) + { + _mm_gemmlowp->run(gemm_pack); + } + else + { + _mm_gemm->run(gemm_pack); + } +} + +void ClFullyConnected::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = tensors.get_const_tensor(ACL_SRC_1); + + CLAuxTensorHandler reshaped_weights(offset_int_vec(TransposedWeights), _reshaped_weights, tensors, false); + CLAuxTensorHandler converted_weights(offset_int_vec(ConvertedWeights), _converted_weights, tensors, false); + + // Pointer to current weights + const ITensor *cur_weights = weights; + + // Reshape of the weights if needed (happens only once) + if(!_are_weights_reshaped) + { + // Run reshape weights kernel and mark weights as unused + ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + _reshape_weights->run(transpose_pack); + + cur_weights->mark_as_unused(); + cur_weights = reshaped_weights.get(); + + _are_weights_reshaped = true; + } + + // Convert weights if needed (happens only once) + if(!_are_weights_converted) + { + ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + _convert_weights->run(convert_pack); + + cur_weights->mark_as_unused(); + cur_weights = converted_weights.get(); + + _are_weights_converted = true; + } + + tensors.add_const_tensor(ACL_SRC_1, cur_weights); + + // Prepare the GEMM and release unused weights + if(!_is_quantized) + { + _mm_gemm->prepare(tensors); + } + else + { + _mm_gemmlowp->prepare(tensors); + } + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClFullyConnected::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h new file mode 100644 index 0000000000..dc5f9e5c9b --- /dev/null +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -0,0 +1,138 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE.
+ */ +#ifndef ARM_COMPUTE_CL_FULLY_CONNECTED_H +#define ARM_COMPUTE_CL_FULLY_CONNECTED_H + +#include "arm_compute/core/TensorInfo.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +#include <memory> + +namespace arm_compute +{ +namespace opencl +{ +// Forward declarations +class ClConvertFullyConnectedWeights; +class ClFlatten; +class ClGemm; +class ClGemmLowpMatrixMultiplyCore; +class ClTranspose; + +/** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL functions: + * + * -# @ref ClFlatten (called when the input comes from a convolutional layer) + * -# @ref ClTranspose (if @p are_weights_reshaped is set to false and transpose_weights is set to true) (called once) + * -# @ref ClGemm or @ref ClGemmLowpMatrixMultiplyCore (if quantized asymmetric) + * + * @note The fully connected layer accepts "weights" tensors only with 2 dimensions. + */ +class ClFullyConnected : public IClOperator +{ +public: + ClFullyConnected(); + ~ClFullyConnected(); + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] weights Weights tensor. The weights must be 2 dimensional. + * If this function is called after a Convolution Layer, the (transposed) weights will have as many rows as the product of the input's first 3 dimensions. + * If it is called after another FullyConnected Layer, the (transposed) weights will have as many rows as the input's first dimension. + * Data type supported: Same as @p src. + * @param[in] biases Bias tensor. Can be nullptr. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Its shape should be equal to the output of a matrix multiplication between: + * - The flattened input and the (transposed) 2D weights, if the function is called after a Convolution Layer + * - The input tensor and the (transposed) 2D weights, if the function is called after another FullyConnected Layer. + * Data type supported: Same as @p src.
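+ * + * @note For example (illustrative figures, not from the original patch): with a convolution output of shape [W, H, C] = [7, 7, 64], the flattened input has 7 * 7 * 64 = 3136 elements, + * so the (transposed) weights info must satisfy dimension(1) == 3136, matching the assert in configure_conv_fc().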
+ * @param[in] fc_info (Optional) Fully connected layer additional info + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClFullyConnected::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); + + // Inherited methods overridden + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + +private: + enum AuxTensorIdx + { + TransposedWeights = 10, + ConvertedWeights = 11, + FlattenedSrc = 12, + Count = 13 + }; + + std::unique_ptr<ClConvertFullyConnectedWeights> _convert_weights; + std::unique_ptr<ClFlatten> _flatten; + std::unique_ptr<ClTranspose> _reshape_weights; + std::unique_ptr<ClGemm> _mm_gemm; + std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + + experimental::MemoryRequirements _aux_mem{}; + + TensorInfo _flattened_src{}; + TensorInfo _converted_weights{}; + TensorInfo _reshaped_weights{}; + + TensorInfo _weights_to_use{}; + int _weights_to_use_idx{ ACL_SRC_1 }; + + bool _are_weights_converted{ true }; + bool _are_weights_reshaped{ true }; + bool _is_fc_after_conv{ true }; + bool _is_quantized{ false }; + bool _is_prepared{ false }; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_FULLY_CONNECTED_H */ diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp new file mode 100644 index 0000000000..625c057cf4 --- /dev/null +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -0,0 +1,771 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClGemm.h" + +#include "arm_compute/core/CL/CLKernelLibrary.h" +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/GPUTarget.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Log.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Utils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "arm_compute/runtime/ITensorAllocator.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/utils/helpers/float_ops.h" +#include "src/gpu/cl/IClKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" +#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" + +#include "support/Cast.h" +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_gemm; +using namespace arm_compute::experimental; +using namespace arm_compute::utils::cast; +using namespace arm_compute::opencl::kernels; + +namespace +{ +inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) +{ + switch(kernel_type) + { + case CLGEMMKernelType::NATIVE_V1: + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + case CLGEMMKernelType::RESHAPED_V1: + case CLGEMMKernelType::RESHAPED: + { + return true; + } + default: + { + return false; + } + } +} +//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type +inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +{ + if(!constant_weights) + { + return CLGEMMKernelType::NATIVE_V1; + } + + auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); + if(bool(gemm_kernel)) + { + if(validate_gemm_kernel(gemm_kernel.gemm_type)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; + } + } + gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; +} +// Validate lhs_info and rhs_info for reshaped only rhs kernel +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, + const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo tmp_b_info{}; + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, 
rhs_info))) + { + return false; + } + // Validate mm kernel + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + gemm_kernel_info.has_pad_y = false; + if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + { + return false; + } + gemm_kernel_info.has_pad_y = true; + if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + { + return false; + } + return true; +} + +//Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, const ITensorInfo *output) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); + if(config) + { + if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; + } + } + config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; +} + +// Validate lhs_info and rhs_info for reshaped kernel +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, + const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + + // Validate reshape LHS kernel + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + { + return false; + } + + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + { + return false; + } + return true; +} + +//Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, + const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); + if(config) + { + if(validate_lhs_rhs_info_reshaped(config.lhs_info, 
config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; + } + } + config = auto_heuristics::select_default_gemm_config_reshaped(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; +} +} // namespace + +ClGemm::ClGemm() + : _mm_kernel(std::make_unique<ClGemmMatrixMultiplyKernel>()), + _reshape_lhs_kernel(std::make_unique<ClGemmReshapeLhsMatrixKernel>()), + _reshape_rhs_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), + _mm_reshaped_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedKernel>()), + _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), + _mm_reshaped_only_rhs_fallback_kernel(std::make_unique<ClGemmMatrixMultiplyReshapedOnlyRhsKernel>()), + _tmp_a(), + _tmp_b(), + _reshape_b_only_on_first_run(false), + _gemm_kernel_type(CLGEMMKernelType::NATIVE_V1), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) +{ +} + +void ClGemm::configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, + const GEMMInfo &gemm_info) +{ + const unsigned int m = gemm_info.reinterpret_input_as_3d() ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _mm_kernel->set_target(gpu_target); + + GEMMReshapeInfo reshape_info(m, n, k, 1, 1, gemm_info.depth_output_gemm3d(), gemm_info.reinterpret_input_as_3d(), gemm_info.broadcast_bias()); + + // Configure and tune matrix multiply kernel + _mm_kernel->configure(compile_context, a, b, c, output, alpha, beta, false, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + + // Tune kernel statically + CLScheduler::get().tune_kernel_static(*_mm_kernel); +} + +void ClGemm::configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, + const GEMMInfo &gemm_info) +{ + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? 
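+ // When the input is reinterpreted as 3D, dimensions 1 and 2 are collapsed into a single + // row count, so M becomes their product.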
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + + // Set the target for the kernels + _reshape_lhs_kernel->set_target(gpu_target); + _mm_kernel->set_target(gpu_target); + + if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) + { + mult_transpose1xW_width = 4; + mult_interleave4x4_height = 2; + } + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = 16 / b->element_size(); + rhs_info.k0 = 1; + rhs_info.h0 = mult_transpose1xW_width; + rhs_info.interleave = false; + rhs_info.transpose = false; + + GEMMLHSMatrixInfo lhs_info; + lhs_info.m0 = 4; + lhs_info.k0 = 4; + lhs_info.v0 = mult_interleave4x4_height; + lhs_info.interleave = true; + lhs_info.transpose = true; + + GEMMReshapeInfo reshape_info(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); + + // Configure interleave kernel + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, reinterpret_input_as_3d); + + // Configure transpose kernel + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); + + // Configure and tune matrix multiply kernel + _mm_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, true, reshape_info, gemm_info.fp_mixed_precision(), gemm_info.activation_info()); + + CLScheduler::get().tune_kernel_static(*_mm_kernel); + + // Request memory for LHS and RHS reshape matrix + _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); +} + +void ClGemm::configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, + const GEMMInfo &gemm_info) +{ + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
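+ // With a 3D-reinterpreted input the batch count moves up one dimension (dimension 3 instead of 2).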
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + // Set the target for the kernels + _reshape_lhs_kernel->set_target(gpu_target); + _mm_kernel->set_target(gpu_target); + + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, + c, output, gemm_info.reinterpret_input_as_3d()); + + _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); + + // Configure and tune matrix multiply kernel + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + + // Request memory for LHS and RHS reshape matrix + _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); + _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); +} + +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, + const GEMMInfo &gemm_info) +{ + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + // Set the target for the kernels + _mm_kernel->set_target(gpu_target); + + GEMMLHSMatrixInfo lhs_info{}; + GEMMRHSMatrixInfo rhs_info{}; + + // Pick up the GEMM configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); + + // Transpose matrix + _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); + + // Configure two variants of CLGEMMMatrixMultiplyReshapedOnlyRHSKernel (has_pad_y = false/true) + // During the prepare stage we check the padding requirement for the lhs and dst tensors. 
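+ // (Here pad y means padding().top + padding().bottom of a tensor, which is exactly what ClGemm::run() checks before dispatching.)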
If they do not have + // pad y, we dispatch CLGEMMMatrixMultiplyReshapedOnlyRHSKernel with has_pad_y = false + + // Configure matrix multiply kernel with no y padding support + kernel_info.has_pad_y = false; + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + + // Configure matrix multiply kernel with y padding support + kernel_info.has_pad_y = true; + _mm_reshaped_only_rhs_fallback_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + + // Request memory for RHS reshape matrix + _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); +} + +Status ClGemm::validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d, gemm_info.broadcast_bias()); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(a, b, c, output, alpha, beta, + false, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); + + return Status{}; +} + +Status ClGemm::validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + const unsigned int m = gemm_info.reinterpret_input_as_3d() ? 
(a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + int mult_transpose1xW_width = 1; + int mult_interleave4x4_height = 1; + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + + if(get_arch_from_target(gpu_target) == GPUTarget::BIFROST) + { + mult_transpose1xW_width = 4; + mult_interleave4x4_height = 2; + } + + GEMMRHSMatrixInfo rhs_info; + rhs_info.n0 = 16 / b->element_size(); + rhs_info.k0 = 1; + rhs_info.h0 = mult_transpose1xW_width; + rhs_info.interleave = false; + rhs_info.transpose = false; + + GEMMLHSMatrixInfo lhs_info; + lhs_info.m0 = 4; + lhs_info.k0 = 4; + lhs_info.v0 = mult_interleave4x4_height; + lhs_info.interleave = true; + lhs_info.transpose = true; + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, mult_transpose1xW_width, mult_interleave4x4_height, depth_output_gemm3d, false, gemm_info.broadcast_bias()); + + // Validate interleave kernel + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + + // Validate transpose kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, + true, reshape_info, gpu_target, gemm_info.fp_mixed_precision(), gemm_info.activation_info())); + + return Status{}; +} + +Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_a_info{}; + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = false; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; + + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + return Status{}; +} + +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_UNUSED(alpha); + ARM_COMPUTE_UNUSED(output); + + TensorInfo tmp_b_info{}; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + const DataType data_type = a->data_type(); + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); + + GEMMKernelInfo kernel_info; + kernel_info.m = m; + kernel_info.n = n; + kernel_info.k = k; + kernel_info.depth_output_gemm3d = depth_output_gemm3d; + kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + kernel_info.broadcast_bias = broadcast_bias; + kernel_info.activation_info = gemm_info.activation_info(); + + GEMMLHSMatrixInfo lhs_info; + GEMMRHSMatrixInfo rhs_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; + + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); + + // Validate matrix multiply + kernel_info.has_pad_y = false; + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + kernel_info.has_pad_y = true; + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + + return Status{}; +} + +void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + + // Perform validation step + ARM_COMPUTE_ERROR_THROW_ON(validate(a, b, c, output, alpha, beta, gemm_info)); + + // Check if we need to reshape the matrix B only on the first run + _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _is_prepared = gemm_info.retain_internal_weights(); + + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + + // Select GEMMType + _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, + gemm_info.constant_weights()); + + const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); + + ITensorInfo *c_to_use = fuse_add_c ? 
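+ // The beta * C addition is fused into the GEMM only when beta is non-zero and a C matrix is provided; + // otherwise no addend tensor is passed.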
c : nullptr; + + switch(_gemm_kernel_type) + { + case CLGEMMKernelType::NATIVE_V1: + { + configure_native_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + case CLGEMMKernelType::RESHAPED_V1: + { + configure_reshaped_v1(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + case CLGEMMKernelType::RESHAPED: + { + configure_reshaped_v2(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + configure_reshaped_only_rhs(compile_context, a, b, c_to_use, output, alpha, beta, gemm_info); + break; + } + default: + { + ARM_COMPUTE_ERROR("GEMMType not supported"); + } + } +} + +Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +{ + // Get the GPU target + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + + // Select GEMMType + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery + { + CLScheduler::get().target(), a->data_type(), m, n, k, batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), gemm_info.constant_weights()); + + const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); + + const ITensorInfo *c_to_use = fuse_add_c ? c : nullptr; + + switch(gemm_kernel_type) + { + case CLGEMMKernelType::NATIVE_V1: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_native_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + case CLGEMMKernelType::RESHAPED_V1: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_v1(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + case CLGEMMKernelType::RESHAPED: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs(a, b, c_to_use, output, alpha, beta, gemm_info)); + break; + } + default: + { + ARM_COMPUTE_RETURN_ERROR_MSG("GEMMType not supported"); + } + } + + return Status{}; +} + +void ClGemm::run(ITensorPack &tensors) +{ + const ITensor *lhs = tensors.get_const_tensor(ACL_SRC_0); + const ITensor *rhs = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *src2 = tensors.get_const_tensor(ACL_SRC_2); + ITensor *dst = tensors.get_tensor(ACL_DST); + + ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, dst); + + CLAuxTensorHandler lhs_reshaped(offset_int_vec(LhsReshape), _tmp_a, tensors, true); + CLAuxTensorHandler rhs_reshaped(offset_int_vec(RhsReshape), _tmp_b, tensors, true); + + // Prepare the consts if needed + prepare(tensors); + + // Run matrix multiply kernel + switch(_gemm_kernel_type) + { + case CLGEMMKernelType::NATIVE_V1: + { + CLScheduler::get().enqueue_op(*_mm_kernel, tensors, true); + break; + } + case CLGEMMKernelType::RESHAPED_V1: + case CLGEMMKernelType::RESHAPED: + { + // Run interleave kernel + ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; + CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); + + if(!_reshape_b_only_on_first_run) + { + // Run transpose kernel + ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { 
ACL_DST, rhs_reshaped.get() } }; + CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); + } + + ITensorPack gemm_reshaped_pack{ { ACL_SRC_0, lhs_reshaped.get() }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } }; + + if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); + } + else + { + CLScheduler::get().enqueue_op(*_mm_kernel, gemm_reshaped_pack, true); + } + break; + } + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + if(!_reshape_b_only_on_first_run) + { + // Run transpose kernel + ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); + } + // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement + // Check if the lhs or dst tensors have padding + const unsigned int cross_plane_pad_lhs = lhs->info()->padding().top + lhs->info()->padding().bottom; + const unsigned int cross_plane_pad_dst = dst->info()->padding().top + dst->info()->padding().bottom; + bool has_pad_y = (cross_plane_pad_lhs != 0) || (cross_plane_pad_dst != 0); + + ITensorPack gemm_reshaped_onlyrhs_pack{ { ACL_SRC_0, lhs }, { ACL_SRC_1, rhs_reshaped.get() }, { ACL_SRC_2, src2 }, { ACL_DST, dst } }; + if(has_pad_y) + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_fallback_kernel, gemm_reshaped_onlyrhs_pack, true); + } + else + { + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_onlyrhs_pack, true); + } + break; + } + default: + { + ARM_COMPUTE_ERROR("GEMMType not supported"); + } + } +} + +void ClGemm::prepare(ITensorPack &constants) +{ + if(!_is_prepared) + { + const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); + ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); + + // If memory for RHS is persistent and src1 is provided, re-transform; otherwise assume that RHS is already transformed + if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr)) + { + ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); + + CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); + ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); + + ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; + CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); + } + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClGemm::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h new file mode 100644 index 0000000000..60bb78c371 --- /dev/null +++ b/src/gpu/cl/operators/ClGemm.h @@ -0,0 +1,137 @@ +/* + * Copyright (c) 2016-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_H
+#define ARM_COMPUTE_CL_GEMM_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+#include "arm_compute/runtime/CL/CLTypes.h"
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClKernel.h"
+#include "src/gpu/cl/IClOperator.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h"
+#include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h"
+#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to execute GEMM on OpenCL. This function calls the following OpenCL kernels:
+ *
+ *  -# @ref kernels::ClGemmReshapeLhsMatrixKernel (only if RESHAPED_V1 or RESHAPED is selected by the heuristic model)
+ *  -# @ref kernels::ClGemmReshapeRhsMatrixKernel (only if RESHAPED_V1, RESHAPED or RESHAPED_ONLY_RHS is selected by the heuristic model)
+ *  -# @ref kernels::ClGemmMatrixMultiplyKernel (only if NATIVE_V1 or RESHAPED_V1 is selected by the heuristic model)
+ *  -# @ref kernels::ClGemmMatrixMultiplyReshapedKernel (only if RESHAPED is selected by the heuristic model)
+ *  -# @ref kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel (only if RESHAPED_ONLY_RHS is selected by the heuristic model)
+ */
+class ClGemm : public IClOperator
+{
+public:
+    /** Constructor */
+    ClGemm();
+    /** Initialise the kernel's inputs and output
+     *
+     * Valid data layouts:
+     * - All
+     *
+     * Valid data type configurations:
+     * |src0 |src1 |src2 |dst |
+     * |:------------|:-----------|:---------|:--------------|
+     * |F32 |F32 |F32 |F32 |
+     * |F16 |F16 |F16 |F16 |
+     *
+     * @note GEMM: General Matrix Multiply - [alpha * A * B + beta * C].
+     *
+     * @note All tensors must have the same data type.
+     *
+     * @note Whilst the first input tensor can be a vector, the second input tensor must be at least a matrix
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] a               First input tensor (Matrix or Vector A). Data types supported: F16/F32
+     * @param[in] b               Second input tensor (Matrix B). Data type supported: same as @p a.
+     * @param[in] c               Third input tensor (Matrix C).
It can be a nullptr if just the multiplication between @p a and @p b is needed. Data type supported: same as @p a. + * @param[out] output Output tensor. Data type supported: same as @p a + * @param[in] alpha Weight of the matrix product + * @param[in] beta Weight of matrix C + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping + * in case matrix A and matrix B have been already transformed. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClGemm::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + void configure_native_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_reshaped_v1(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_reshaped_v2(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + + static Status validate_native_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_reshaped_v1(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + +private: + enum AuxTensorIdx + { + LhsReshape = 0, + RhsReshape, + Count + }; + +private: + std::unique_ptr<kernels::ClGemmMatrixMultiplyKernel> _mm_kernel; + std::unique_ptr<kernels::ClGemmReshapeLhsMatrixKernel> _reshape_lhs_kernel; + std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _reshape_rhs_kernel; + std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedKernel> _mm_reshaped_kernel; + std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; + std::unique_ptr<kernels::ClGemmMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_fallback_kernel; + TensorInfo _tmp_a; + TensorInfo _tmp_b; + bool _reshape_b_only_on_first_run; + CLGEMMKernelType _gemm_kernel_type; + bool _is_prepared; + 
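+    // _aux_mem publishes this operator's auxiliary tensor requirements (slot, lifetime and size,
+    // as filled in by configure()) so that the caller can allocate the workspace buffers and hand
+    // them back through the ITensorPack. A minimal, illustrative sketch of the intended call
+    // sequence; the caller-side names here are invented for illustration only:
+    //
+    //     ClGemm gemm;
+    //     gemm.configure(compile_context, &a_info, &b_info, &c_info, &dst_info, 1.f, 0.f, gemm_info);
+    //     for(const auto &req : gemm.workspace())
+    //     {
+    //         // Allocate a CL buffer of req.size bytes, keep it alive for as long as
+    //         // req.lifetime requires, and register it in the run()/prepare() packs
+    //         // under slot req.slot.
+    //     }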
+    experimental::MemoryRequirements _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_GEMM_H */
diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp
new file mode 100644
index 0000000000..0f625bc56a
--- /dev/null
+++ b/src/gpu/cl/operators/ClGemmConv2d.cpp
@@ -0,0 +1,628 @@
+/*
+ * Copyright (c) 2017-2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClGemmConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/PixelValue.h"
+#include "arm_compute/core/Size2D.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/core/utils/quantization/AsymmHelpers.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/helpers/AutoConfiguration.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClActivationKernel.h"
+#include "src/gpu/cl/kernels/ClCol2ImKernel.h"
+#include "src/gpu/cl/kernels/ClIm2ColKernel.h"
+#include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h"
+#include "src/gpu/cl/operators/ClGemm.h"
+#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+using namespace experimental;
+using namespace misc::shape_calculator;
+using namespace utils::cast;
+namespace opencl
+{
+ClGemmConv2d::ClGemmConv2d()
+    : _weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(),
+      _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count)
+{
+}
+ClGemmConv2d::~ClGemmConv2d() = default;
+
+void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                                const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                                int gemm_3d_depth, const ActivationLayerInfo &act_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights);
+    ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info));
+
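+    // The GEMMInfo below is what routes the convolution through GEMM: reshape_b_only_on_first_run
+    // is set to true since the reshaped conv weights do not change between runs,
+    // reinterpret_input_as_3d mirrors _skip_im2col so that 1x1 NHWC convolutions can feed the
+    // GEMM directly as a 3D tensor, and broadcast_bias lets the 1D bias vector be added across
+    // every output row.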
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         _skip_im2col,          // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fast_math
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info);             // activation_info
+
+    TensorInfo tmp_src{ *src };
+    if(_is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = src->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+        _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>();
+        _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info);
+
+        // Restore the original QuantizationInfo as the weights could be used in other convolution layers
+        weights->set_quantization_info(weights_quantization_info);
+
+        auto mm_mem_req = _mm_gemmlowp->workspace();
+        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+    else
+    {
+        // Configure matrix multiply function
+        _mm_gemm = std::make_unique<ClGemm>();
+        _mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+        auto mm_mem_req = _mm_gemm->workspace();
+        for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont)
+        {
+            _aux_mem[cont] = mm_mem_req[cont];
+        }
+    }
+}
+
+Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst,
+                                 const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info)
+{
+    const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+
+    const GEMMInfo &gemm_info = GEMMInfo(false,                 // is_a_reshaped
+                                         false,                 // is_b_reshaped
+                                         true,                  // reshape_b_only_on_first_run
+                                         gemm_3d_depth,         // depth_output_gemm3d
+                                         skip_im2col,           // reinterpret_input_as_3d
+                                         false,                 // retain_internal_weights
+                                         gemmlowp_output_stage, // gemmlowp_output_stage
+                                         false,                 // fast_math
+                                         false,                 // fp_mixed_precision
+                                         true,                  // broadcast_bias
+                                         act_info);             // activation_info
+
+    if(is_quantized)
+    {
+        // Since we need negative offsets for computing convolution, we need to change QuantizationInfo()
+        // Extract and negate input and weights offset
+        const QuantizationInfo input_quantization_info   = src->quantization_info();
+        const QuantizationInfo weights_quantization_info = weights->quantization_info();
+
+        std::unique_ptr<ITensorInfo> src_qa     = src->clone();
+        std::unique_ptr<ITensorInfo> weights_qa = weights->clone();
+        src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset));
+        weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset));
+
+        // Perform validation step on GEMMLowp
+        return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info);
+    }
+    else
+    {
+        // Perform validation step on Matrix multiply function
+        return ClGemm::validate(src, weights, biases, dst, 1.0f, 1.0f, gemm_info);
+    }
+}
+
+void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                             const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info)
+{
+    ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst);
+
+    ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst,
+                                                      conv2d_info,
+                                                      weights_info));
+
+    const DataType   data_type   = src->data_type();
+    const DataLayout data_layout = src->data_layout();
+    const int        idx_width   = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH);
+    const int        idx_height  = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT);
+    const int        idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES);
+
+    const unsigned int kernel_width  = weights->dimension(idx_width);
+    const unsigned int kernel_height = weights->dimension(idx_height);
+    const unsigned int num_kernels   = weights->dimension(idx_kernels);
+
+    const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+    const UniformQuantizationInfo oq_info = dst->quantization_info().uniform();
+
+    _is_prepared  = weights_info.retain_internal_weights();
+    _is_quantized = is_data_type_quantized_asymmetric(src->data_type());
+    _skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1);
+    _skip_col2im  = data_layout == DataLayout::NHWC;
+
+    // Only for quantized types are there a few cases where we cannot fuse the activation function in GEMM
+    _fuse_activation = true;
+
+    const ITensorInfo *gemm_input_to_use  = src;
+    ITensorInfo       *gemm_output_to_use = dst;
+
+    // Get parameters from conv_info
+    unsigned int stride_x = 0;
+    unsigned int stride_y = 0;
+    std::tie(stride_x, stride_y) = conv2d_info.conv_info.stride();
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+                                                 src->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv2d_info.conv_info,
+                                                 conv2d_info.dilation);
+
+    unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+    ITensorInfo *biases_to_use = biases;
+    _append_bias               = false;
+
+    _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>();
+    if(conv2d_info.num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 can only be for NCHW
+        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+        biases_to_use = nullptr;
+        _append_bias  = true;
+        _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups);
+    }
+    else
+    {
+        _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups);
+    }
+
+    // Create tensor to store im2col reshaped inputs
+    if(!_skip_im2col)
+    {
+        // Configure and tune im2col.
im2col output shape is auto-initialized + _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>(); + + // Set the GPU target for im2col + _im2col_kernel->set_target(CLScheduler::get().target()); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + + // Set quantization info + _im2col_output.set_quantization_info(src->quantization_info()); + CLScheduler::get().tune_kernel_static(*_im2col_kernel); + + // Update GEMM input + gemm_input_to_use = &_im2col_output; + } + + // Create GEMM output tensor + if(!_skip_col2im) + { + TensorShape shape_gemm; + + // If we cannot skip col2im it means we run im2col as well + shape_gemm = _im2col_output.tensor_shape(); + shape_gemm.set(0, mat_weights_cols); + shape_gemm.set(1, conv_w * conv_h); + + _gemm_output = TensorInfo(shape_gemm, 1, data_type); + _gemm_output.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout()); + + // Update GEMM output + gemm_output_to_use = &_gemm_output; + } + + GEMMLowpOutputStageInfo gemmlowp_output_stage; + gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; + gemmlowp_output_stage.gemmlowp_offset = 0; + + // Configure output stage for quantized case + if(_is_quantized) + { + const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info; + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); + const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; + + gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; + + gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); + gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, + gemmlowp_output_stage.gemmlowp_multipliers.data(), + gemmlowp_output_stage.gemmlowp_shifts.data()); + gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; + gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; + + PixelValue min_val{}; + PixelValue max_val{}; + std::tie(min_val, max_val) = get_min_max(dst->data_type()); + + auto min_activation = min_val.get<int32_t>(); + auto max_activation = max_val.get<int32_t>(); + + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + }; + + if(conv2d_info.act_info.enabled()) + { + if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + } + else + { + _fuse_activation = false; + } + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_min_bound = min_activation; + gemmlowp_output_stage.gemmlowp_max_bound = max_activation; + } + + // Configure and tune GEMM + // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix + const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? 
conv_h : 0; + + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); + + if(!_skip_col2im) + { + // Set the GPU target for col2im + _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>(); + _col2im_kernel->set_target(CLScheduler::get().target()); + // Configure and tune Col2Im + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups); + CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); + } + + ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), + "Output shape does not match the expected one"); + + if(!_fuse_activation) + { + _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>(); + _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); + } + + _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); +} + +Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); + + if(!is_quantized_per_channel) + { + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); + } + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW)); + + const DataLayout data_layout = src->data_layout(); + const DataType data_type = src->data_type(); + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); + const int idx_kernels = get_data_layout_dimension_index(data_layout, DataLayoutDimension::BATCHES); + + const unsigned int kernel_width = weights->dimension(idx_width); + const unsigned int kernel_height = weights->dimension(idx_height); + const unsigned int num_kernels = weights->dimension(idx_kernels); + + TensorInfo im2col_reshaped_info{}; + TensorInfo info_gemm{}; + TensorInfo weights_reshaped_info{}; + const ITensorInfo *gemm_input_to_use = src; + const ITensorInfo *gemm_output_to_use = dst; + const ITensorInfo 
*weights_to_use = weights;
+    const bool is_quantized = is_data_type_quantized_asymmetric(data_type);
+    const bool skip_im2col  = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1
+                               && conv2d_info.conv_info.stride().second == 1);
+    const bool skip_col2im  = data_layout == DataLayout::NHWC;
+    bool fuse_activation    = true;
+
+    ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel));
+    ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4);
+
+    // Validate biases
+    if(biases != nullptr)
+    {
+        if(is_quantized)
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32);
+        }
+        else
+        {
+            ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, biases);
+        }
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->dimension(0) != weights->dimension(idx_kernels));
+        ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1);
+    }
+
+    if(conv2d_info.act_info.enabled())
+    {
+        ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a());
+    }
+
+    // Get convolved dimensions
+    unsigned int conv_w = 0;
+    unsigned int conv_h = 0;
+
+    std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width),
+                                                 src->dimension(idx_height),
+                                                 kernel_width,
+                                                 kernel_height,
+                                                 conv2d_info.conv_info,
+                                                 conv2d_info.dilation);
+
+    unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups;
+
+    const ITensorInfo *biases_to_use = biases;
+    bool               append_bias   = false;
+
+    if(conv2d_info.num_groups != 1 && biases != nullptr)
+    {
+        // num_groups != 1 can only be for NCHW
+        // Since a utility function to reshape the biases is missing, we append the biases to the weights tensor
+        biases_to_use         = nullptr;
+        append_bias           = true;
+        weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type);
+    }
+    else
+    {
+        weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type);
+    }
+
+    weights_to_use = &weights_reshaped_info;
+
+    if(!skip_im2col)
+    {
+        const Size2D kernel_dims(kernel_width, kernel_height);
+
+        // Output tensor auto initialization if not yet initialized
+        TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups);
+
+        auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape));
+
+        ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups));
+        gemm_input_to_use = &im2col_reshaped_info;
+    }
+
+    // Create GEMM output tensor
+    if(!skip_col2im)
+    {
+        TensorShape shape_gemm;
+
+        shape_gemm = gemm_input_to_use->tensor_shape();
+        shape_gemm.set(0, mat_weights_cols);
+        shape_gemm.set(1, conv_w * conv_h);
+
+        info_gemm = TensorInfo(shape_gemm, 1, data_type);
+        info_gemm.set_quantization_info(dst->quantization_info()).set_data_layout(src->data_layout());
+        gemm_output_to_use = &info_gemm;
+    }
+
+    GEMMLowpOutputStageInfo gemmlowp_output_stage;
+    gemmlowp_output_stage.type                     = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT;
+    gemmlowp_output_stage.gemmlowp_offset          = 0;
+    gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel;
+
+    if(is_quantized)
+    {
+        const UniformQuantizationInfo iq_info = src->quantization_info().uniform();
+        const
UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); + const auto output_quant_info = (dst->total_size() == 0) ? iq_info : oq_info; + const unsigned int num_filters = (is_quantized_per_channel) ? num_kernels : 1; + + gemmlowp_output_stage.gemmlowp_multipliers.resize(num_filters); + gemmlowp_output_stage.gemmlowp_shifts.resize(num_filters); + quantization::compute_quantized_multipliers_and_shifts(src, weights, dst, + gemmlowp_output_stage.gemmlowp_multipliers.data(), + gemmlowp_output_stage.gemmlowp_shifts.data()); + gemmlowp_output_stage.gemmlowp_multiplier = gemmlowp_output_stage.gemmlowp_multipliers[0]; + gemmlowp_output_stage.gemmlowp_shift = gemmlowp_output_stage.gemmlowp_shifts[0]; + + int min_activation = 0; + int max_activation = 0; + + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, + ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU + }; + + if(conv2d_info.act_info.enabled()) + { + if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + { + std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + } + else + { + fuse_activation = false; + } + } + + // Set the GEMMLowp output stage info + gemmlowp_output_stage.gemmlowp_offset = output_quant_info.offset; + gemmlowp_output_stage.gemmlowp_min_bound = min_activation; + gemmlowp_output_stage.gemmlowp_max_bound = max_activation; + } + + // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix + const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; + + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); + + // Validate Col2Im + if(!skip_col2im) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + } + + //Validate Activation Layer + if(!fuse_activation) + { + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); + } + + return Status{}; +} + +void ClGemmConv2d::run(ITensorPack &tensors) +{ + prepare(tensors); + + auto src = tensors.get_const_tensor(ACL_SRC_0); + auto biases = tensors.get_const_tensor(ACL_SRC_2); + auto dst = tensors.get_tensor(ACL_DST); + auto gemm_input_to_use = src; + auto gemm_output_to_use = dst; + + CLAuxTensorHandler im2col_output(offset_int_vec(Im2ColOutput), _im2col_output, tensors, false); + CLAuxTensorHandler gemm_output(offset_int_vec(GemmOutput), _gemm_output, tensors, false); + CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); + + // Run im2col + if(!_skip_im2col) + { + ITensorPack pack = + { + { TensorType::ACL_SRC, src }, + { TensorType::ACL_DST, im2col_output.get() } + }; + CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); + gemm_input_to_use = im2col_output.get(); + } + if(!_skip_col2im) + { + gemm_output_to_use = gemm_output.get(); + } + ITensorPack pack_mm = tensors; + pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); + pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + if(!_append_bias) + { + pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); + } + pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); 
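+    // At this point pack_mm aliases the caller's tensors with this operator's internal views:
+    // ACL_SRC_0 is the im2col output (or src when im2col is skipped), ACL_SRC_1 the reshaped
+    // weights, ACL_SRC_2 the biases (omitted when they were appended to the weights), and
+    // ACL_DST the intermediate GEMM output (or dst when col2im is skipped).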
+ // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions + if(_is_quantized) + { + // Run gemmlowp + _mm_gemmlowp->run(pack_mm); + } + else + { + // Run gemm + _mm_gemm->run(pack_mm); + } + + // Reshape output matrix + if(!_skip_col2im) + { + ITensorPack pack = + { + { TensorType::ACL_SRC, gemm_output_to_use }, + { TensorType::ACL_DST, dst } + }; + CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); + } + + //Run Activation Layer if we cannot fuse in GEMM + if(!_fuse_activation) + { + ITensorPack pack = + { + { TensorType::ACL_SRC, dst }, + { TensorType::ACL_DST, dst } + }; + CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); + } +} + +void ClGemmConv2d::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + // Run weights reshaping and mark original weights tensor as unused + ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); + auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); + ITensorPack pack = + { + { TensorType::ACL_SRC, weights }, + { TensorType::ACL_DST, weights_reshaped.get() } + }; + + if(_append_bias) + { + const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); + pack.add_const_tensor(TensorType::ACL_BIAS, biases); + } + CLScheduler::get().enqueue_op(*_weights_reshape_kernel.get(), pack, true); + tensors.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); + + // Prepare GEMM + _is_quantized ? _mm_gemmlowp->prepare(tensors) : _mm_gemm->prepare(tensors); + _is_prepared = true; + } +} +experimental::MemoryRequirements ClGemmConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h new file mode 100644 index 0000000000..9a5e381dd7 --- /dev/null +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -0,0 +1,185 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */
+#ifndef ARM_COMPUTE_CL_GEMM_CONV2D_H
+#define ARM_COMPUTE_CL_GEMM_CONV2D_H
+
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/core/Types.h"
+#include "arm_compute/runtime/FunctionDescriptors.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+class ClGemm;
+class ClGemmLowpMatrixMultiplyCore;
+namespace kernels
+{
+class ClIm2ColKernel;
+class ClCol2ImKernel;
+class ClWeightsReshapeKernel;
+class ClActivationKernel;
+} // namespace kernels
+
+/** Basic function to compute the convolution layer. This function calls the following OpenCL kernels/functions:
+ *
+ * -# @ref opencl::kernels::ClIm2ColKernel
+ * -# @ref ClGemm (if the data type is FP32 or FP16)
+ * -# @ref ClGemmLowpMatrixMultiplyCore (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref ClGemmLowpOutputStage with QUANTIZE_DOWN_FIXEDPOINT type of quantization (if the data type is QASYMM8/QASYMM8_SIGNED)
+ * -# @ref opencl::kernels::ClCol2ImKernel (if NCHW data layout)
+ * -# @ref opencl::kernels::ClActivationKernel
+ */
+class ClGemmConv2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClGemmConv2d();
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClGemmConv2d(const ClGemmConv2d &) = delete;
+    /** Default move constructor */
+    ClGemmConv2d(ClGemmConv2d &&) = default;
+    /** Prevent instances of this class from being copied (As this class contains pointers) */
+    ClGemmConv2d &operator=(const ClGemmConv2d &) = delete;
+    /** Default move assignment operator */
+    ClGemmConv2d &operator=(ClGemmConv2d &&) = default;
+    /** Default destructor */
+    ~ClGemmConv2d();
+    /** Set the input and output tensors.
+     *
+     * Valid data layouts:
+     * - NHWC
+     * - NCHW
+     *
+     * Valid data type configurations:
+     * |src0           |src1               |src2     |dst            |
+     * |:--------------|:------------------|:--------|:--------------|
+     * |F16            |F16                |F16      |F16            |
+     * |F32            |F32                |F32      |F32            |
+     * |QASYMM8        |QASYMM8            |S32      |QASYMM8        |
+     * |QASYMM8        |QSYMM8_PER_CHANNEL |S32      |QASYMM8        |
+     * |QASYMM8_SIGNED |QASYMM8_SIGNED     |S32      |QASYMM8_SIGNED |
+     * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32      |QASYMM8_SIGNED |
+     *
+     * @param[in] compile_context The compile context to be used.
+     * @param[in] src             Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                            while every optional dimension from 4 and above represent a batch of inputs.
+     *                            Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights         Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM].
+     *                            Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in] biases          Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                            Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[out] dst            Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                            Data types supported: Same as @p input.
+     * @param[in] conv2d_info     Contains convolution 2d info described in @ref Conv2dInfo.
+     * @param[in] weights_info    Specifies if the weights tensor has been reshaped with ClWeightsReshapeKernel. If this is not part of the fully connected layer, the weights
+     *                            tensor has also been transposed with ClGemmReshapeRhsMatrixKernel. Data type supported: Same as @p input.
+     */
+    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info,
+                   const WeightsInfo &weights_info = WeightsInfo());
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClGemmConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info,
+                           const WeightsInfo &weights_info = WeightsInfo());
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &constants) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    /** Configures the appropriate matrix multiply routine
+     *
+     * @param[in]      compile_context       The compile context to be used.
+     * @param[in]      src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in]      weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                       QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in]      biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                                       Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in, out] dst                   Output tensor info. Data types supported: same as @p input.
+     * @param[in]      gemmlowp_output_stage GEMMLowp output stage info
+     * @param[in]      gemm_3d_depth         Depth of GEMM 3D
+     * @param[in]      act_info              Activation to apply after the matrix multiplication
+     */
+    void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst,
+                      const GEMMLowpOutputStageInfo &gemmlowp_output_stage,
+                      int gemm_3d_depth, const ActivationLayerInfo &act_info);
+    /** Static function to check if given info will lead to a valid configuration of the @ref ClGemmConv2d matrix multiply routines
+     *
+     * @param[in] src                   Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[in] weights               Weights tensor info. Data type supported: Same as @p input or QASYMM8/QSYMM8_PER_CHANNEL when @p input is QASYMM8 or
+     *                                  QASYMM8_SIGNED/QSYMM8_PER_CHANNEL when @p input is QASYMM8_SIGNED.
+     * @param[in] biases                Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM].
+     *                                  Data type supported: Should match @p input data type, except for input of quantized type where biases should be of S32 type.
+     * @param[in] dst                   Output tensor info. Data types supported: same as @p input.
+     * @param[in] gemmlowp_output_stage GEMMLowp output stage info
+     * @param[in] gemm_3d_depth         Depth of GEMM 3D
+     * @param[in] skip_im2col           Flag which specifies if im2col has to be skipped, i.e. 1x1 convolution with NHWC data layout.
+ * @param[in] act_info Activation to apply after the matrix multiplication + * + * @return a status + */ + static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info); + + enum AuxTensorIdx + { + // ClGemmLowpMatrixMultiplyCore has up to 7 internal tensors + Im2ColOutput = 8, + WeightsReshaped, + GemmOutput, + Count + }; + + std::unique_ptr<kernels::ClWeightsReshapeKernel> _weights_reshape_kernel; + std::unique_ptr<kernels::ClIm2ColKernel> _im2col_kernel; + std::unique_ptr<ClGemm> _mm_gemm; + std::unique_ptr<ClGemmLowpMatrixMultiplyCore> _mm_gemmlowp; + std::unique_ptr<opencl::kernels::ClCol2ImKernel> _col2im_kernel; + std::unique_ptr<kernels::ClActivationKernel> _activation_kernel; + + TensorInfo _im2col_output; + TensorInfo _weights_reshaped; + TensorInfo _gemm_output; + + bool _skip_im2col; + bool _skip_col2im; + bool _is_quantized; + bool _fuse_activation; + bool _append_bias; + bool _is_prepared; + + experimental::MemoryRequirements _aux_mem; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMM_CONV2D_H */ diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp new file mode 100644 index 0000000000..f3c0ee1c8f --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -0,0 +1,786 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/Helpers.h" +#include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/Log.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/helpers/AutoConfiguration.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/gpu/cl/kernels/ClCastKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpReductionKernel.h" +#include "src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" + +#include "utils/TypePrinter.h" + +namespace arm_compute +{ +namespace opencl +{ +using namespace arm_compute::misc::shape_calculator; +using namespace arm_compute::cl_gemm; +using namespace arm_compute::opencl::kernels; +using namespace arm_compute::experimental; + +namespace +{ +inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) +{ + switch(kernel_type) + { + case CLGEMMKernelType::NATIVE: + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + { + return true; + } + default: + { + return false; + } + } +} + +//Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type +inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) +{ + auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); + if(bool(gemm_kernel)) + { + if(validate_gemm_kernel(gemm_kernel.gemm_type)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; + } + } + gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + return gemm_kernel.gemm_type; +} + +// Validate lhs_info and rhs_info for native kernel +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo mm_result_s32_info{}; + // Output tensor auto initialization if not yet initialized + auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + // Validate mm kernel + // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info + // NOTE: This assumes: + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). + // 2. 
lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). + if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + { + return false; + } + return true; +} + +// Automatically select between mlgo (prioritized) and default heuristics for native kernel configs +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_native(query); + if(config) + { + if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; + } + } + config = auto_heuristics::select_default_gemm_config_native(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; +} + +// Validate lhs_info and rhs_info for reshaped only rhs kernel +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, + unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +{ + // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel + TensorInfo tmp_b_info{}; + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); + if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + { + return false; + } + // Validate mm kernel + // NOTE: Ignore all other parameters (eg. depth_output_gemm3d, output stage etc.) and only validate lhs and rhs info + // NOTE: This assumes: + // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_arguments). + // 2. lhs and rhs info does not cause window and padding issues through side effects (in ClGemmLowpMatrixMultiplyReshapedOnlyRHSKernel.cpp validate_and_configure_window). 
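+    // Only the fields read by the lhs/rhs validation are populated below. As a worked example of
+    // ACL's [x, y, ...] dimension order (dimension values hypothetical): a GEMM with m = 64,
+    // n = 128, k = 32 corresponds to an lhs 'a' of shape (32, 64), an rhs 'b' of shape (128, 32)
+    // and an output of shape (128, 64), since n = b->dimension(0) and k = a->dimension(0).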
+ GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + // Since we ignore the output stage, output data type has to be S32 to pass the validation + TensorInfo output_info_copy(*output); + output_info_copy.set_data_type(DataType::S32); + if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + { + return false; + } + return true; +} + +// Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, const ITensorInfo *output) +{ + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); + if(config) + { + if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + { + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; + } + } + config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return { config.lhs_info, config.rhs_info }; +} + +inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) +{ + switch(kernel_type) + { + case CLGEMMKernelType::NATIVE: + return false; + case CLGEMMKernelType::RESHAPED_ONLY_RHS: + return true; + default: + ARM_COMPUTE_ERROR("Not supported gemmlowp kernel!"); + } +} +} // namespace + +ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() + : _weights_to_qasymm8(std::make_unique<ClCastKernel>()), + _mm_native_kernel(std::make_unique<ClGemmLowpMatrixMultiplyNativeKernel>()), + _mm_reshaped_only_rhs_kernel(std::make_unique<ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel>()), + _mtx_b_reshape_kernel(std::make_unique<ClGemmReshapeRhsMatrixKernel>()), + _mtx_a_reduction_kernel(std::make_unique<ClGemmLowpMatrixAReductionKernel>()), + _mtx_b_reduction_kernel(std::make_unique<ClGemmLowpMatrixBReductionKernel>()), + _offset_contribution_kernel(std::make_unique<ClGemmLowpOffsetContributionKernel>()), + _offset_contribution_output_stage_kernel(std::make_unique<ClGemmLowpOffsetContributionOutputStageKernel>()), + _aux_mem(AuxTensorIdx::Count) +{ +} + +ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; + +void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, + ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, + const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c != nullptr ? 
c : nullptr, output, gemm_info)); + + _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); + _a_offset = a->quantization_info().uniform().offset; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) + && a->data_type() == DataType::QASYMM8; + _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; + _gemm_info = gemm_info; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + // Set the target for the kernels + _mm_native_kernel->set_target(gpu_target); + _mm_reshaped_only_rhs_kernel->set_target(gpu_target); + + GEMMRHSMatrixInfo rhs_info; + GEMMLHSMatrixInfo lhs_info; + + // Arguments used by GEMMReshapeInfo + // If we pass the matrix A and matrix B reshaped to CLGEMMMatrixMultiplyKernel, we need to pass m, n, k, mult_transpose1xW_width and mult_interleave4x4_height to CLGEMMReshapeInfo + // in order to know how the matrices have been reshaped + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + + const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + // Check if we need to reshape the matrix A and matrix B + _is_gemm_reshaped = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run)); + + if(_convert_to_qasymm8) + { + // Set data type for converted weights + _qasymm8_weights = *b; + _qasymm8_weights.set_data_type(DataType::QASYMM8); + _weights_to_qasymm8->configure(compile_context, b, &_qasymm8_weights, ConvertPolicy::WRAP); + } + + ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; + if(_is_gemm_reshaped) + { + matrix_b = &_tmp_b; + + // Pick up the GEMM configuration + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, + depth_output_gemm3d, + a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + + // Configure reshape RHS kernel + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + } + + // Using default reduction info + const GEMMLowpReductionKernelInfo reduction_info {}; + + // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0) + { + _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); + + // Configure Matrix B reduction kernel + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_vector_sum_col, reduction_info); + } + + // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); + + // Configure matrix A reduction kernel + _mtx_a_reduction_kernel->configure(compile_context, a, &_vector_sum_row, reduction_info); + } + + GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + gemm_kernel_info.a_offset = _a_offset; + gemm_kernel_info.b_offset = _b_offset; + // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage + if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + // Configure offset contribution kernel + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + + _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); + _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); + + GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); + gemmlowp_output_stage.output_data_type = a->data_type(); + if(num_filters == 1) + { + // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. + // Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts + gemmlowp_output_stage.is_quantized_per_channel = false; + } + + gemm_kernel_info.output_stage = gemmlowp_output_stage; + + if(_is_gemm_reshaped && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + // Configure and tune matrix multiply kernel with fused output stage + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + } + else + { + _run_output_stage = true; + + if(_is_gemm_reshaped) + { + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + } + else + { + // Pick up the GEMM configuration + // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, + a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info); + + // Configure matrix multiply kernel + _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); + + _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, + c != nullptr ? 
output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + } + } + } + else + { + _run_offset_contribution = true; + if(_is_gemm_reshaped) + { + // Configure and tune matrix multiply kernel + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); + } + else + { + // Pick up the GEMM configuration + // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, + a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); + + // Configure matrix multiply kernel + _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info); + } + + // Configure offset contribution kernel + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, + c, a->dimension(0), _a_offset, _b_offset); + } + + // Request memory + _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); + if(_is_gemm_reshaped) + { + // Overwrite the RHS lifetime as Prepare when the GEMM is reshaped, since the reshape makes it a two-step transformation + _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if(_a_offset != 0) + { + _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size()); + } + if(_b_offset != 0) + { + _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); +} + +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(a, b, output); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_b_reshaped(), "Matrix B already reshaped is not supported"); + + int32_t a_offset = a->quantization_info().uniform().offset; + int32_t b_offset = b->quantization_info().uniform().offset; + + const ITensorInfo *matrix_a_info = a; + + TensorInfo tmp_b_info{}; + GEMMRHSMatrixInfo rhs_info; + GEMMLHSMatrixInfo lhs_info; + + // Get the GPU target + const GPUTarget gpu_target = CLScheduler::get().target(); + + bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + + bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); + + const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); + + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) + && is_data_type_quantized_asymmetric(a->data_type()); + TensorInfo weights_info(*b); + if(convert_to_qasymm8) + { + b_offset = -128; + weights_info.set_data_type(DataType::QASYMM8); + ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); + } + const ITensorInfo *matrix_b_info = &weights_info; + if(reshape_matrix_b) + { + matrix_b_info = &tmp_b_info; + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration + const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; + + // Validate reshape RHS kernel + auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); + } + + TensorInfo info_vector_sum_col{}; + TensorInfo info_vector_sum_row{}; + + const GEMMLowpReductionKernelInfo reduction_info; + // Validate matrix B reduction kernel only if a_offset is not equal to 0 + if(a_offset != 0) + { + info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); + + // Validate matrix B reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + } + + // Validate matrix A reduction kernel only if b_offset is not equal to 0 + if(b_offset != 0) + { + info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); + + // Validate matrix A reduction kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + } + + GEMMKernelInfo gemm_kernel_info; + gemm_kernel_info.m = m; + gemm_kernel_info.n = n; + gemm_kernel_info.k = k; + gemm_kernel_info.depth_output_gemm3d = depth_output_gemm3d; + gemm_kernel_info.reinterpret_input_as_3d = reinterpret_input_as_3d; + gemm_kernel_info.lhs_info = lhs_info; + gemm_kernel_info.rhs_info = rhs_info; + gemm_kernel_info.a_offset = a_offset; + gemm_kernel_info.b_offset = b_offset; + if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + { + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? 
gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + + const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + + GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); + gemmlowp_output_stage.output_data_type = a->data_type(); + + gemm_kernel_info.output_stage = gemmlowp_output_stage; + if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + { + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, + a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, + c, + &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); + } + else + { + TensorInfo mm_result_s32_info{}; + + if(reshape_matrix_b) + { + // Output tensor auto-initialization if not yet initialized + auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + } + else + { + // Output tensor auto-initialization if not yet initialized + auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); + + // Pick up the GEMM configuration + // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails + // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration + const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + } + + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, + a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, + c, + output, + a_offset, b_offset, + gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); + } + } + else + { + if(reshape_matrix_b) + { + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + } + else + { + // Pick up the GEMM configuration + // It doesn't matter whether the data type is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affects the shape configuration + const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; + + // Validate matrix multiply + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + } + + if(output->total_size() != 0) + { + // Validate offset contribution kernel + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, + a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? nullptr : &info_vector_sum_row, + c, + a_offset, b_offset)); + } + } + + return Status{}; +} + +void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) +{ + const ITensor *a = tensors.get_const_tensor(ACL_SRC_0); + const ITensor *b = tensors.get_const_tensor(ACL_SRC_1); + const ITensor *c = tensors.get_const_tensor(ACL_SRC_2); + ITensor *dst = tensors.get_tensor(ACL_DST); + + ARM_COMPUTE_ERROR_ON_NULLPTR(a, dst); + + CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true); + CLAuxTensorHandler vec_sum_row(offset_int_vec(VecSumRow), _vector_sum_row, tensors, true); + CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, true); + CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); + CLAuxTensorHandler res32(offset_int_vec(ResultS32), _mm_result_s32, tensors, true); + CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, true); + CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, true); + + // Prepare the constants if needed + prepare(tensors); + + const ITensor *matrix_a = a; + const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; + + if(_is_gemm_reshaped) + { + matrix_b = tmp_b.get(); + if(!_reshape_b_only_on_first_run) + { + // Run reshape matrix B + ITensorPack mtx_b_reshape_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, + { TensorType::ACL_DST, tmp_b.get() } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); + } + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && !_reshape_b_only_on_first_run) + { + ITensorPack mtx_b_red_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b }, + { TensorType::ACL_DST, vec_sum_col.get() } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); + } + + // Run matrix A reduction kernel only if _b_offset is not equal to 0 + if(_b_offset != 0) + { + ITensorPack mtx_a_red_pack = + { + { TensorType::ACL_SRC, matrix_a }, + { TensorType::ACL_DST, vec_sum_row.get() } + }; + CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); + } + + // Run matrix multiply + if(_is_gemm_reshaped) + { + ITensorPack gemm_reshaped_pack; + if(_run_offset_contribution) + { + gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a }, + { TensorType::ACL_SRC_1, matrix_b }, + { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst } + }); + } + else + { + gemm_reshaped_pack = ITensorPack( + { + { TensorType::ACL_SRC, matrix_a }, + { TensorType::ACL_SRC_1, matrix_b }, + { TensorType::ACL_BIAS, c }, + { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, + { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, + { TensorType::ACL_SHIFTS, shifts.get() }, + { TensorType::ACL_MULTIPLIERS, multipliers.get() }, + { TensorType::ACL_DST, dst }, + }); + } + CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); + } + else + { + ITensorPack gemm_native_pack = + { + { TensorType::ACL_SRC_0, matrix_a }, + { TensorType::ACL_SRC_1, matrix_b }, + { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() } + }; + CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); + } + if(_run_output_stage) + { + // Run offset contribution/output stage kernel + ITensorPack output_stage_pack = + { + { TensorType::ACL_SRC, res32.get() }, + { TensorType::ACL_BIAS, c }, + { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, + { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, + { TensorType::ACL_SHIFTS, shifts.get() }, + { TensorType::ACL_MULTIPLIERS, multipliers.get() }, + { TensorType::ACL_DST, dst }, + }; + CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); + } + if(_run_offset_contribution) + { + // Run offset contribution kernel + ITensorPack offset_contrib_pack = + { + { TensorType::ACL_SRC_DST, dst }, + { TensorType::ACL_BIAS, c }, + { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, + { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() } + }; + CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); + } +} + +void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); + CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); + CLAuxTensorHandler vec_sum_col(offset_int_vec(VecSumCol), _vector_sum_col, tensors, true); + CLAuxTensorHandler rhs_qasymm8(offset_int_vec(RhsQAsymm8), _qasymm8_weights, tensors, false); + + ARM_COMPUTE_ERROR_ON_NULLPTR(b); + + if(_convert_to_qasymm8) + { + ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } }; + CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); + b->mark_as_unused(); + } + + if(_is_gemm_reshaped && _reshape_b_only_on_first_run) + { + // Run reshape kernel and mark original weights tensor as unused + ITensorPack mtx_b_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b }, + { TensorType::ACL_DST, tmp_b.get() } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); + b->mark_as_unused(); + } + + // Run matrix B reduction kernel only if _a_offset is not equal to 0 + if(_a_offset != 0 && _reshape_b_only_on_first_run) + { + ITensorPack mtx_b_red_pack = + { + { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, + { TensorType::ACL_DST, vec_sum_col.get() } + }; + CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); + } + + // Compute GEMM output multipliers and shifts for output stage + { + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + + CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); + CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); + + ICLTensor *multiplier_tensor = multipliers.get(); + if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + { + multiplier_tensor->map(CLScheduler::get().queue(), true); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); + multiplier_tensor->unmap(CLScheduler::get().queue()); + } + + ICLTensor *shifts_tensor = shifts.get(); + if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + { + shifts_tensor->map(CLScheduler::get().queue(), true); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + shifts_tensor->unmap(CLScheduler::get().queue()); + } + } + CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClGemmLowpMatrixMultiplyCore::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h new file mode 100644 index 0000000000..1965e3f97b --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -0,0 +1,155 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H +#define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H + +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/runtime/CL/CLTypes.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +namespace kernels +{ +// Forward declarations +class ClCastKernel; +class ClGemmLowpMatrixMultiplyNativeKernel; +class ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel; +class ClGemmReshapeRhsMatrixKernel; +class ClGemmLowpMatrixAReductionKernel; +class ClGemmLowpMatrixBReductionKernel; +class ClGemmLowpOffsetContributionKernel; +class ClGemmLowpOffsetContributionOutputStageKernel; +} // namespace kernels + +/** Basic function to execute GEMMLowpMatrixMultiplyCore on OpenCL. */ +class ClGemmLowpMatrixMultiplyCore : public IClOperator +{ +public: + ClGemmLowpMatrixMultiplyCore(); + ~ClGemmLowpMatrixMultiplyCore(); + /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:------------------|:--------|:--------------| + * |QASYMM8 |QASYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |QASYMM8 | + * |QASYMM8 |QSYMM8 |S32 |QASYMM8 | + * |QASYMM8 |QASYMM8 |S32 |S32 | + * |QASYMM8 |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8 |QSYMM8 |S32 |S32 | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QSYMM8 |S32 |QASYMM8_SIGNED | + * |QASYMM8_SIGNED |QASYMM8_SIGNED |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8_PER_CHANNEL |S32 |S32 | + * |QASYMM8_SIGNED |QSYMM8 |S32 |S32 | + * + * @note GEMMLowp: low precision GEMM kernel. [A * B + C] + * This kernel performs the following computations: + * + * -# Convert the values of matrix A from 8-bit quantized to int32 and add a_offset to each of them. + * -# Convert the values of matrix B from 8-bit quantized to int32 and add b_offset to each of them. + * -# Compute the matrix product of the resulting a * b in int32. + * -# Quantize to the 8-bit output data type if gemm_info.gemmlowp_output_stage != NONE + * + * @param[in] compile_context The compile context to be used. + * @param[in] a First input tensor (Matrix A). Data type supported: QASYMM8/QASYMM8_SIGNED. + * @param[in] b Second input tensor (Matrix B). Data type supported: same as @p a + * @param[in] c Third input tensor (Matrix C). It can be a nullptr. Data type supported: S32 + * @param[out] output Output tensor. 
Data type supported: S32 or QASYMM8/QASYMM8_SIGNED if gemm_info.gemmlowp_output_stage != NONE + * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and + * if the reshape of matrix B should be executed only for the first run + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClGemmLowpMatrixMultiplyCore::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum AuxTensorIdx + { + ResultS32 = 0, + RhsQAsymm8, + RhsReshape, + VecSumCol, + VecSumRow, + Multipliers, + Shifts, + Count + }; + +private: + // Kernels used + std::unique_ptr<kernels::ClCastKernel> _weights_to_qasymm8; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyNativeKernel> _mm_native_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel> _mm_reshaped_only_rhs_kernel; + std::unique_ptr<kernels::ClGemmReshapeRhsMatrixKernel> _mtx_b_reshape_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; + std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; + std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; + std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + + // Temporary tensors + TensorInfo _qasymm8_weights{}; + TensorInfo _vector_sum_col{}; + TensorInfo _vector_sum_row{}; + TensorInfo _tmp_b{}; + TensorInfo _mm_result_s32{}; + TensorInfo _gemm_output_stage_multipliers{}; + TensorInfo _gemm_output_stage_shifts{}; + + int32_t _a_offset{ 0 }; + int32_t _b_offset{ 0 }; + bool _is_gemm_reshaped{ true }; + bool _reshape_b_only_on_first_run{ false }; + bool _run_output_stage{ false }; + bool _convert_to_qasymm8{ false }; + bool _run_offset_contribution{ false }; + bool _is_prepared{ false }; + GEMMInfo _gemm_info{}; + + experimental::MemoryRequirements _aux_mem{}; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_CORE_H */
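The computation steps listed in the class documentation above follow a simple identity, which is also why configure() only sets up the matrix B (column-sum) reduction when _a_offset != 0 and the matrix A (row-sum) reduction when _b_offset != 0. A scalar reference sketch, illustrative code rather than library code:

#include <cstdint>
#include <vector>

// dst[m][n] = sum_k (a[m][k] + a_off) * (b[k][n] + b_off)
//           = sum_k a[m][k] * b[k][n]     (matrix multiply kernel, int32 accumulation)
//           + a_off * colsum(b)[n]        (matrix B reduction, needed only if a_off != 0)
//           + b_off * rowsum(a)[m]        (matrix A reduction, needed only if b_off != 0)
//           + K * a_off * b_off           (constant, folded into the offset contribution)
std::vector<int32_t> gemmlowp_reference(const std::vector<uint8_t> &a, const std::vector<uint8_t> &b,
                                        int M, int N, int K, int32_t a_off, int32_t b_off)
{
    std::vector<int32_t> dst(M * N, 0);
    for(int m = 0; m < M; ++m)
    {
        for(int n = 0; n < N; ++n)
        {
            int32_t acc = 0;
            for(int k = 0; k < K; ++k)
            {
                // Widen to int32 and apply the offsets before multiplying
                acc += (int32_t(a[m * K + k]) + a_off) * (int32_t(b[k * N + n]) + b_off);
            }
            dst[m * N + n] = acc;
        }
    }
    return dst;
}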
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp new file mode 100644 index 0000000000..27fb89217c --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -0,0 +1,98 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClGemmLowpOutputStage.h" + +#include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Types.h" +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" +#include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); + + switch(info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + { + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel>(); + k->configure(compile_context, src, bias, dst, &info); + _kernel = std::move(k); + break; + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN: + { + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel>(); + k->configure(compile_context, src, bias, dst, &info); + _kernel = std::move(k); + break; + } + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: + { + auto k = std::make_unique<opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel>(); + k->configure(compile_context, src, bias, dst, &info); + _kernel = std::move(k); + break; + } + default: + ARM_COMPUTE_ERROR("Unsupported GEMMLowpOutputStage type."); + } +} + +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +{ + ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); + + switch(info.type) + { + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); + case 
GEMMLowpOutputStageType::QUANTIZE_DOWN: + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel::validate(src, bias, dst, &info); + case GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT: + return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(src, bias, dst, &info); + default: + return ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Unsupported GEMMLowpOutputStage type."); + } +} + +void ClGemmLowpOutputStage::run(ITensorPack &tensors) +{ + const ITensor *src = tensors.get_const_tensor(ACL_SRC); + const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); + ITensor *dst = tensors.get_tensor(ACL_DST); + + ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } }; + CLScheduler::get().enqueue_op(*_kernel, pack, true); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h new file mode 100644 index 0000000000..3f1b04dcce --- /dev/null +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -0,0 +1,88 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H +#define ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +/** This file contains all available output stages for GEMMLowp on OpenCL. + * + * In gemmlowp, the "output stage" is the process that takes a final int32 accumulator value (the output of @ref CLGEMMLowpMatrixMultiplyCore), + * and processes it to obtain the final QASYMM8/QASYMM8_SIGNED value. + * + * More information about the GEMMLowp output stage can be found at https://github.com/google/gemmlowp/blob/master/doc/output.md + */ + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to execute GEMMLowpQuantizeDown kernels on CL. 
+ * + * This function calls the following CL kernels: + * + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFloatKernel + * -# @ref opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel +*/ +class ClGemmLowpOutputStage : public IClOperator +{ +public: + /** Constructor */ + ClGemmLowpOutputStage() = default; + /** Initialise the kernel's inputs, output + * + * Valid data layouts: + * - All + * + * Valid data type configurations: + * |src0 |src1 |dst | + * |:--------------|:-------------|:-------------| + * |S32 |S32 |QASYMM8 | + * |S32 |S32 |QASYMM8_SIGNED| + * |S32 |S32 |QSYMM16 | + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor. Data type supported: S32 + * @param[in] bias Biases tensor. Only shared biases supported and it can be a nullptr if the biases addition is not required. + * Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src. + * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16 + * @param[in] info GEMMLowp output stage metadata. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClGemmLowpOutputStage::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_GEMMLOWP_OUTPUT_STAGE_H */ diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp new file mode 100644 index 0000000000..b909066e4c --- /dev/null +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2017-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. 
+ */ +#include "src/gpu/cl/operators/ClLogicalNot.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClLogicalNot::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClElementWiseUnaryKernel>(); + k->configure(compile_context, src, dst, ElementWiseUnary::LOGICAL_NOT); + _kernel = std::move(k); +} + +Status ClLogicalNot::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClElementWiseUnaryKernel::validate(src, dst, ElementWiseUnary::LOGICAL_NOT); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.h b/src/gpu/cl/operators/ClLogicalNot.h new file mode 100644 index 0000000000..31d4a99be6 --- /dev/null +++ b/src/gpu/cl/operators/ClLogicalNot.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_LOGICAL_NOT_H +#define ARM_COMPUTE_CL_LOGICAL_NOT_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClElementWiseUnaryKernel for NOT operation */ +class ClLogicalNot : public IClOperator +{ +public: + /** Configure operator for a given list of arguments + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: U8. + * @param[out] dst Destination tensor info. Data types supported: same as @p src. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClLogicalNot::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_LOGICAL_NOT_H */ diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp new file mode 100644 index 0000000000..59d2b96bee --- /dev/null +++ b/src/gpu/cl/operators/ClMul.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClMul.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClMulKernel>(); + k->configure(compile_context, src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); + _kernel = std::move(k); +} + +Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +{ + return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); +} + +void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClComplexMulKernel>(); + k->configure(compile_context, src1, src2, dst, act_info); + _kernel = std::move(k); +} + +Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +{ + return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); +} +} // namespace opencl +} // namespace arm_compute
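ClComplexMul above multiplies 2-channel F16/F32 tensors; assuming the two channels hold the real and imaginary parts of a complex value, the per-element arithmetic reduces to the usual complex product. A scalar sketch that ignores broadcasting and the optional fused activation:

#include <array>

// (a + bi) * (c + di) = (ac - bd) + (ad + bc)i
std::array<float, 2> complex_mul(const std::array<float, 2> &x, const std::array<float, 2> &y)
{
    return { { x[0] * y[0] - x[1] * y[1],     // real part:      ac - bd
               x[0] * y[1] + x[1] * y[0] } }; // imaginary part: ad + bc
}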
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h new file mode 100644 index 0000000000..6a158c910d --- /dev/null +++ b/src/gpu/cl/operators/ClMul.h @@ -0,0 +1,103 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_MUL_H +#define ARM_COMPUTE_CL_MUL_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref opencl::kernels::ClMulKernel */ +class ClMul : public IClOperator +{ +public: + /** Initialise the kernel's sources, dst and conversion policy. + * + * Valid configurations (src1,src2) -> Output : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,U8) -> S16 + * - (S16,S16) -> S16 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * - (QSYMM16,QSYMM16) -> S32 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 An src tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/F16/F32. + * @param[in] scale Scale to apply after multiplication. + * Scale must be positive and its value must be either 1/255 or 1/2^n where n is between 0 and 15. + * @param[in] overflow_policy Overflow policy. Supported overflow policies: Wrap, Saturate + * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
+ */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, + ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; + +/** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ +class ClComplexMul : public IClOperator +{ +public: + /** Initialise the kernel's sources, dst. + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 An src tensor info. Data types supported: F16/F32. Number of channels supported: 2. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 An src tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * The src tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClComplexMul::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_MUL_H */
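The scale restriction documented for ClMul::configure() above (positive, and equal to 1/255 or 1/2^n with n in [0, 15]) can be checked up front by a caller. A small illustrative helper, not library code:

#include <cmath>

bool is_valid_cl_mul_scale(float scale)
{
    if(scale <= 0.f)
    {
        return false;
    }
    if(scale == 1.f / 255.f) // assumes the caller passes exactly the same constant
    {
        return true;
    }
    // scale == 1/2^n  <=>  n == log2(1/scale) is an integer in [0, 15]
    const float n = std::log2(1.f / scale);
    return std::abs(n - std::round(n)) < 1e-6f && n >= 0.f && n <= 15.f;
}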
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp new file mode 100644 index 0000000000..05717d5bb7 --- /dev/null +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -0,0 +1,57 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClPRelu.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +using KernelType = kernels::ClArithmeticKernel; +void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) +{ + auto k = std::make_unique<KernelType>(); + k->configure(compile_context, ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output)); + _kernel = std::move(k); +} + +Status ClPRelu::validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output) +{ + return KernelType::validate(ArithmeticOperation::PRELU, input, alpha, (output == nullptr ? input : output)); +} + +void ClPRelu::run(ITensorPack &tensors) +{ + // Output tensor can be given as nullptr for in-place computation. + // In this case, get the input tensor and use it as the output tensor. + if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) + { + auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); + tensors.add_tensor(TensorType::ACL_DST, src_tensor); + } + IClOperator::run(tensors); +} +} // namespace opencl +} // namespace arm_compute
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h new file mode 100644 index 0000000000..8084ab86cd --- /dev/null +++ b/src/gpu/cl/operators/ClPRelu.h @@ -0,0 +1,64 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_PRELU_H +#define ARM_COMPUTE_CL_PRELU_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic operator to run @ref arm_compute::opencl::kernels::ClArithmeticKernel for PRELU + * + * @note The operator implements an activation layer with the PRELU activation function. + */ +class ClPRelu : public IClOperator +{ +public: + /** Set the input and output tensor. + * + * @note If the output tensor is a nullptr or is equal to the input, the activation function will be performed in-place + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Source tensor. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. + * @param[in] alpha PRelu layer parameters. Data types supported: same as @p input. + * @param[out] output Destination tensor. Data type supported: same as @p input + */ + void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPRelu::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *alpha, const ITensorInfo *output); + + // Inherited methods overridden: + void run(ITensorPack &tensors) override; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_PRELU_H */ diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp new file mode 100644 index 0000000000..ed74e22b6c --- /dev/null +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClPermute.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClPermuteKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +{ + auto k = std::make_unique<kernels::ClPermuteKernel>(); + k->configure(compile_context, src, dst, perm); + _kernel = std::move(k); +} + +Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm) +{ + return kernels::ClPermuteKernel::validate(src, dst, perm); +} +} // namespace opencl +} // namespace arm_compute
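How a permutation vector rearranges a shape, assuming ACL's convention that output dimension i takes input dimension perm[i] (with shapes stored fastest-dimension first, the NCHW -> NHWC conversion seen across the library is the vector (2, 0, 1)); an illustrative sketch, not library code:

#include <array>
#include <cstddef>

std::array<std::size_t, 3> permute_shape(const std::array<std::size_t, 3> &in,
                                         const std::array<std::size_t, 3> &perm)
{
    std::array<std::size_t, 3> out{};
    for(std::size_t i = 0; i < in.size(); ++i)
    {
        out[i] = in[perm[i]]; // dst dimension i <- src dimension perm[i]
    }
    return out;
}

// e.g. an NCHW shape stored as (W, H, C) permuted with (2, 0, 1) yields (C, W, H), i.e. NHWC.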
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h new file mode 100644 index 0000000000..3e87329f9b --- /dev/null +++ b/src/gpu/cl/operators/ClPermute.h @@ -0,0 +1,58 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_PERMUTE_H +#define ARM_COMPUTE_CL_PERMUTE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClPermuteKernel */ +class ClPermute : public IClOperator +{ +public: + /** Initialise the kernel's inputs, outputs and permutation vector + * + * @note Arbitrary permutation vectors are supported with rank not greater than 4 + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src + * @param[in] perm Permutation vector + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPermute::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PermutationVector &perm); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_PERMUTE_H */
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp new file mode 100644 index 0000000000..fdadd199fc --- /dev/null +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -0,0 +1,101 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClPool2d.h" + +#include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClPool2dKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + // Configure pooling kernel + auto k = std::make_unique<kernels::ClPool2dKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, dst, info, indices); + _pooling = std::move(k); + + const DataType data_type = src->data_type(); + + // Configure border depending on operation required (quantize border in case of asymmetric data_type) + BorderMode border_mode{}; + PixelValue pixel_value(0.f); + if(is_data_type_quantized_asymmetric(data_type) && !info.exclude_padding) + { + pixel_value = PixelValue(0, data_type, src->quantization_info()); + } + + // Data layout + const auto data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; + + switch(data_layout) + { + case DataLayout::NCHW: + border_mode = (PoolingType::MAX == info.pool_type) ? 
BorderMode::REPLICATE : BorderMode::CONSTANT;
+            break;
+        case DataLayout::NHWC:
+            border_mode = BorderMode::CONSTANT;
+            if(PoolingType::MAX == info.pool_type)
+            {
+                if(is_data_type_quantized(data_type))
+                {
+                    std::tie(pixel_value, std::ignore) = get_min_max(data_type);
+                }
+                else
+                {
+                    pixel_value = PixelValue(std::numeric_limits<float>::lowest());
+                }
+            }
+            break;
+        default:
+            ARM_COMPUTE_ERROR("Data layout not supported");
+    }
+    auto b = std::make_unique<CLFillBorderKernel>();
+    b->configure(compile_context, src, _pooling->border_size(), border_mode, pixel_value);
+    _border_handler = std::move(b);
+
+    // Tune kernels
+    CLScheduler::get().tune_kernel_static(*_pooling);
+}
+
+Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices)
+{
+    return kernels::ClPool2dKernel::validate(src, dst, info, indices);
+}
+
+void ClPool2d::run(ITensorPack &tensors)
+{
+    ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided");
+
+    CLScheduler::get().enqueue_op(*_border_handler.get(), tensors, false);
+    CLScheduler::get().enqueue_op(*_pooling.get(), tensors, false);
+}
+} // namespace opencl
+} // namespace arm_compute
diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h
new file mode 100644
index 0000000000..a041053bb3
--- /dev/null
+++ b/src/gpu/cl/operators/ClPool2d.h
@@ -0,0 +1,72 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_POOL2D_H
+#define ARM_COMPUTE_CL_POOL2D_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+#include <memory>
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a pooling layer with the specified pooling operation. This function calls the following OpenCL kernels:
+ *
+ * -# @ref CLFillBorderKernel (executed if padding size is different from zero)
+ * -# @ref kernels::ClPool2dKernel
+ */
+class ClPool2d : public IClOperator
+{
+public:
+    /** Constructor */
+    ClPool2d() = default;
+    /** Configure operator for a given list of arguments
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info. Data type supported: same as @p src
+     * @param[in]  info            Pooling layer parameters.
+ * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClPool2d::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + + // Inherited method overridden + void run(ITensorPack &tensors) override; + +private: + std::unique_ptr<ICLKernel> _pooling{ nullptr }; + std::unique_ptr<ICLKernel> _border_handler{ nullptr }; +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_POOL2D_H */ diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp new file mode 100644 index 0000000000..915e0fdef0 --- /dev/null +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -0,0 +1,53 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClQuantize.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClQuantize::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClQuantizeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClQuantize::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClQuantizeKernel::validate(src, dst); +} + +void ClQuantize::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + CLScheduler::get().enqueue_op(*_kernel.get(), tensors); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClQuantize.h b/src/gpu/cl/operators/ClQuantize.h new file mode 100644 index 0000000000..3e50fcefb3 --- /dev/null +++ b/src/gpu/cl/operators/ClQuantize.h @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. 
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_QUANTIZE_H
+#define ARM_COMPUTE_CL_QUANTIZE_H
+
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to run @ref kernels::ClQuantizeKernel that quantizes an input tensor */
+class ClQuantize : public IClOperator
+{
+public:
+    /** Set the input and output tensors.
+     *
+     * @param[in]  compile_context The compile context to be used.
+     * @param[in]  src             Source tensor info. The dimensions over the third will be interpreted as batches. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32.
+     * @param[out] dst             Destination tensor info with the same dimensions as the input. Data types supported: QASYMM8/QASYMM8_SIGNED/QASYMM16.
+     *
+     * @note Output auto initialization is not supported by this function
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to @ref ClQuantize::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_QUANTIZE_H */
diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp
new file mode 100644
index 0000000000..2c1d1817d1
--- /dev/null
+++ b/src/gpu/cl/operators/ClReshape.cpp
@@ -0,0 +1,45 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClReshape.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClReshapeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClReshape::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClReshapeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClReshapeKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute
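`ClReshape` configures a single kernel into the inherited `_kernel` member and does not override `run()`. This pattern, shared by `ClTranspose`, `ClQuantize` and `ClSub` below, relies on the base operator class enqueuing that one kernel. A simplified sketch of the assumed base-class behaviour (not the actual `IClOperator` source) follows:

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/runtime/CL/CLScheduler.h"
#include "src/core/CL/ICLKernel.h" // internal header providing ICLKernel

#include <memory>

namespace sketch
{
// What the single-kernel operators effectively inherit: a run() that
// enqueues the one kernel stored by configure(), bound to the tensor pack.
class SingleKernelOperator
{
public:
    void run(arm_compute::ITensorPack &tensors)
    {
        arm_compute::CLScheduler::get().enqueue_op(*_kernel, tensors);
    }

protected:
    std::unique_ptr<arm_compute::ICLKernel> _kernel{};
};
} // namespace sketch
```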
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClReshape.h b/src/gpu/cl/operators/ClReshape.h new file mode 100644 index 0000000000..fee69a1c24 --- /dev/null +++ b/src/gpu/cl/operators/ClReshape.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_RESHAPE_H +#define ARM_COMPUTE_CL_RESHAPE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClReshapeKernel */ +class ClReshape : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] input Input tensor info. Data type supported: All + * @param[out] output Output info. Data type supported: Same as @p input + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClReshape::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *input, const ITensorInfo *output); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_RESHAPE_H */
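A hedged example of the contract `ClReshape::validate` is expected to enforce: source and destination must describe the same total number of elements, while the shapes themselves are free. The shapes below are illustrative only.

```cpp
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClReshape.h"

using namespace arm_compute;

void reshape_checks_element_count()
{
    const TensorInfo src(TensorShape(4U, 6U), 1, DataType::F32); // 24 elements
    const TensorInfo ok(TensorShape(8U, 3U), 1, DataType::F32);  // 24 elements -> expected to validate
    const TensorInfo bad(TensorShape(5U, 5U), 1, DataType::F32); // 25 elements -> expected error status

    const Status s_ok  = opencl::ClReshape::validate(&src, &ok);
    const Status s_bad = opencl::ClReshape::validate(&src, &bad);
    ARM_COMPUTE_UNUSED(s_ok, s_bad);
}
```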
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp new file mode 100644 index 0000000000..6dab66786a --- /dev/null +++ b/src/gpu/cl/operators/ClScale.cpp @@ -0,0 +1,60 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClScale.h" + +#include "arm_compute/core/Error.h" +#include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClScaleKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +{ + ARM_COMPUTE_ERROR_ON_NULLPTR(src); + // Configure Scale kernel + auto k = std::make_unique<kernels::ClScaleKernel>(); + k->set_target(CLScheduler::get().target()); + k->configure(compile_context, src, dst, info); + _kernel = std::move(k); + + // Tune kernel + CLScheduler::get().tune_kernel_static(*_kernel); +} + +Status ClScale::validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info) +{ + return kernels::ClScaleKernel::validate(src, dst, info); +} + +void ClScale::run(ITensorPack &tensors) +{ + ARM_COMPUTE_ERROR_ON_MSG(tensors.empty(), "No inputs provided"); + CLScheduler::get().enqueue_op(*_kernel.get(), tensors); +} +} // namespace opencl +} // namespace arm_compute
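A usage sketch for `ClScale::validate`. The `ScaleKernelInfo` fields shown here (interpolation policy, border mode, constant border value, sampling policy) follow the public Scale API; the exact constructor signature and its defaults are an assumption of this sketch, not something this patch defines.

```cpp
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/PixelValue.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/core/Types.h"
#include "src/gpu/cl/operators/ClScale.h"

using namespace arm_compute;

// Validate a bilinear resize with replicated borders (assumed descriptor layout)
Status validate_bilinear_scale(const ITensorInfo *src, const ITensorInfo *dst)
{
    const ScaleKernelInfo info(InterpolationPolicy::BILINEAR,
                               BorderMode::REPLICATE,
                               PixelValue(),
                               SamplingPolicy::TOP_LEFT);
    return opencl::ClScale::validate(src, dst, info);
}
```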
\ No newline at end of file
diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h
new file mode 100644
index 0000000000..af97cf23e7
--- /dev/null
+++ b/src/gpu/cl/operators/ClScale.h
@@ -0,0 +1,66 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_SCALE_H
+#define ARM_COMPUTE_CL_SCALE_H
+
+#include "arm_compute/core/KernelDescriptors.h"
+#include "src/gpu/cl/ClCompileContext.h"
+#include "src/gpu/cl/IClOperator.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Basic function to simulate a scale layer. This function calls the following OpenCL kernels:
+ *
+ * -# @ref kernels::ClScaleKernel
+ */
+class ClScale : public IClOperator
+{
+public:
+    /** Constructor */
+    ClScale() = default;
+    /** Initialize the function's source, destination, interpolation type and border mode.
+     *
+     * @param[in]     compile_context The compile context to be used.
+     * @param[in,out] src             Source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/F16/F32. (Written to only when the border mode in @p info is not UNDEFINED)
+     * @param[out]    dst             Destination tensor info. Data types supported: Same as @p src
+     *                                All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane.
+     * @param[in]     info            @ref ScaleKernelInfo descriptor to be used to configure
+     */
+    void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClScale::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const ScaleKernelInfo &info);
+
+    // Inherited method overridden
+    void run(ITensorPack &tensors) override;
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_SCALE_H */
diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp
new file mode 100644
index 0000000000..6b728f5354
--- /dev/null
+++ b/src/gpu/cl/operators/ClSoftmax.cpp
@@ -0,0 +1,186 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClSoftmax.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "src/core/helpers/MemoryHelpers.h" +#include "src/core/helpers/SoftmaxHelpers.h" +#include "src/gpu/cl/kernels/ClSoftmaxKernel.h" +#include "src/gpu/cl/operators/ClPermute.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" +#include "support/Cast.h" + +using namespace arm_compute::experimental; + +namespace arm_compute +{ +namespace opencl +{ +ClSoftmax::ClSoftmax() + : _permute_input(std::make_unique<ClPermute>()), + _permute_output(std::make_unique<ClPermute>()), + _max_shift_exp_sum_kernel(std::make_unique<kernels::ClLogits1DMaxShiftExpSumKernel>()), + _norm_kernel(std::make_unique<kernels::ClLogits1DNormKernel>()), + _max_info(), + _sum_info(), + _tmp_info(), + _permuted_src_info(), + _permuted_dst_info(), + _aux_mem(InternalTensorIdx::COUNT) +{ +} + +void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); + + const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); + + _needs_permute = actual_axis != 0; + + const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; + ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst; + + if(_needs_permute) + { + const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); + } + + DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? 
DataType::S32 : tmp_input_info.data_type();
+    _tmp_info              = tmp_input_info.clone()->set_data_type(tmp_data_type);
+
+    TensorShape max_sum_shape = tmp_input_info.tensor_shape();
+    max_sum_shape.set(0, 1); // The max/sum reduction collapses the x dimension to 1, matching validate() below
+    _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape);
+    _sum_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type);
+
+    // Set GPU target to kernels
+    _max_shift_exp_sum_kernel->set_target(CLScheduler::get().target());
+
+    _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info);
+    _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info);
+
+    if(_needs_permute)
+    {
+        const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+        _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info);
+    }
+
+    _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size());
+    _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size());
+    _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size());
+
+    _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size());
+    _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size());
+}
+
+Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported");
+    ARM_COMPUTE_UNUSED(info.beta);
+    ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis);
+
+    const size_t actual_axis   = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions())));
+    const bool   needs_permute = actual_axis != 0;
+    if(needs_permute)
+    {
+        const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis);
+        const TensorShape       permuted_shape     = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector);
+        TensorInfo              input_permuted(src.clone()->set_tensor_shape(permuted_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector));
+        TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape));
+        ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector));
+    }
+
+    // Create intermediate tensor info
+    DataType tmp_data_type = is_data_type_quantized_asymmetric(src.data_type()) ?
DataType::S32 : src.data_type(); + TensorInfo tensor_info_tmp(src.clone()->set_data_type(tmp_data_type).set_is_resizable(true)); + + TensorShape max_sum_shape = src.tensor_shape(); + max_sum_shape.set(0, 1); + TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); + TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); + + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); + + return Status{}; +} + +void ClSoftmax::run(ITensorPack &tensors) +{ + auto src = tensors.get_const_tensor(TensorType::ACL_SRC); + auto dst = tensors.get_tensor(TensorType::ACL_DST); + + CLAuxTensorHandler sum(offset_int_vec(InternalTensorIdx::SUM), _sum_info, tensors, false); + CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); + CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); + + CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); + CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); + + if(_needs_permute) + { + ITensorPack pack; + pack.add_const_tensor(TensorType::ACL_SRC, src); + pack.add_tensor(TensorType::ACL_DST, permuted_src.get()); + _permute_input.get()->run(pack); + } + + ITensorPack sum_pack; + ITensorPack norm_pack; + if(_needs_permute) + { + sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); + norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); + } + else + { + sum_pack.add_const_tensor(TensorType::ACL_SRC, src); + norm_pack.add_tensor(TensorType::ACL_DST, dst); + } + sum_pack.add_tensor(TensorType::ACL_DST, tmp.get()); + sum_pack.add_tensor(TensorType::ACL_INT_0, max.get()); + sum_pack.add_tensor(TensorType::ACL_INT_1, sum.get()); + + norm_pack.add_const_tensor(TensorType::ACL_SRC, tmp.get()); + norm_pack.add_tensor(TensorType::ACL_INT_0, sum.get()); + + CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); + CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); + + if(_needs_permute) + { + ITensorPack pack; + pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); + pack.add_tensor(TensorType::ACL_DST, dst); + _permute_output.get()->run(pack); + } +} + +experimental::MemoryRequirements ClSoftmax::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute
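`ClSoftmax` is the first operator in this set to report auxiliary memory through `workspace()`. A simplified sketch of how a caller can service those requirements before `run()` follows; real runtime wrappers use the `WorkspaceData` helpers from `MemoryHelpers.h`, so the manual loop below is illustrative only.

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "arm_compute/runtime/CL/CLTensor.h"

#include <memory>
#include <vector>

using namespace arm_compute;

// Allocate one CL tensor per workspace entry and expose it in the run pack
// under the entry's slot id, so the CLAuxTensorHandler instances in run()
// import these buffers instead of allocating their own.
std::vector<std::unique_ptr<CLTensor>> allocate_workspace(const experimental::MemoryRequirements &reqs,
                                                          ITensorPack                            &run_pack)
{
    std::vector<std::unique_ptr<CLTensor>> workspace;
    for(const auto &req : reqs)
    {
        if(req.size == 0)
        {
            continue; // e.g. the PERMUTED_* slots when no permute is needed
        }
        auto aux = std::make_unique<CLTensor>();
        // A 1D U8 tensor of the requested byte size is enough for import
        aux->allocator()->init(TensorInfo(TensorShape(req.size), 1, DataType::U8));
        aux->allocator()->allocate();
        run_pack.add_tensor(req.slot, aux.get());
        workspace.emplace_back(std::move(aux));
    }
    return workspace;
}
```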
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h new file mode 100644 index 0000000000..6c9af585d6 --- /dev/null +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -0,0 +1,95 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_SOFTMAX_H +#define ARM_COMPUTE_CL_SOFTMAX_H + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +struct SoftmaxKernelInfo; + +namespace opencl +{ +class ClPermute; +namespace kernels +{ +class ClLogits1DMaxShiftExpSumKernel; +class ClLogits1DNormKernel; +} // namespace kernels +class ClSoftmax : public IClOperator +{ +public: + /** Constructor */ + ClSoftmax(); + /** Configure the operator + * + * @param[in] compile_context The compile context to be used. + * @param[in] src Source tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32 for Softmax and F16/F32 for Log Softmax + * @param[out] dst Destination tensor info. Data types supported: same as @p src + * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClSoftmax::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + // Inherited methods overridden: + void run(ITensorPack &tensors) override; + experimental::MemoryRequirements workspace() const override; + +private: + enum InternalTensorIdx + { + MAX = 0, + SUM, + TMP, + PERMUTED_SRC, + PERMUTED_DST, + COUNT + }; + + std::unique_ptr<ClPermute> _permute_input; + std::unique_ptr<ClPermute> _permute_output; + std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel; + std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel; + bool _needs_permute{ false }; + + TensorInfo _max_info; + TensorInfo _sum_info; + TensorInfo _tmp_info; + TensorInfo _permuted_src_info; + TensorInfo _permuted_dst_info; + + experimental::MemoryRequirements _aux_mem{}; +}; + +} // opencl +} // arm_compute +#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
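A worked example of the axis handling described above: `info.axis` may be negative, and any axis that does not wrap to dimension 0 triggers the permute path. This is a standalone re-implementation of the wrap-around for illustration only.

```cpp
#include <cassert>
#include <cstdint>

// Same convention as the operator: a valid axis lies in [-rank, rank)
int32_t wrapped_axis(int32_t axis, int32_t num_dimensions)
{
    return axis < 0 ? axis + num_dimensions : axis;
}

int main()
{
    // 4D tensor, softmax over the last dimension
    assert(wrapped_axis(-1, 4) == 3); // actual_axis != 0 -> permute path
    assert(wrapped_axis(0, 4) == 0);  // fast path, no permute
    return 0;
}
```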
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp new file mode 100644 index 0000000000..b94fef3cf9 --- /dev/null +++ b/src/gpu/cl/operators/ClSub.cpp @@ -0,0 +1,47 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClSub.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); + k->configure(compile_context, ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); + _kernel = std::move(k); +} + +Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info) +{ + return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); +} +} // namespace opencl +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h new file mode 100644 index 0000000000..902adbf39d --- /dev/null +++ b/src/gpu/cl/operators/ClSub.h @@ -0,0 +1,80 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_SUB_H +#define ARM_COMPUTE_CL_SUB_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run arithmetic subtraction + * + * @note The tensor data type for the inputs must be U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @note The function performs an arithmetic subtraction between two tensors. + */ +class ClSub : public IClOperator +{ +public: + /** Configure function for a given list of arguments. + * + * Valid configurations (src1,src2) -> dst : + * + * - (U8,U8) -> U8 + * - (U8,U8) -> S16 + * - (S16,U8) -> S16 + * - (U8,S16) -> S16 + * - (S16,S16) -> S16 + * - (S32,S32) -> S32 + * - (F16,F16) -> F16 + * - (F32,F32) -> F32 + * - (QASYMM8,QASYMM8) -> QASYMM8 + * - (QASYMM8_SIGNED,QASYMM8_SIGNED) -> QASYMM8_SIGNED + * - (QSYMM16,QSYMM16) -> QSYMM16 + * + * @param[in] compile_context The compile context to be used. + * @param[in, out] src1 First source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[in, out] src2 Second source tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * The source tensor is [in, out] because its TensorInfo might be modified inside the kernel in case of broadcasting of dimension 0. + * @param[out] dst Destination tensor info. Data types supported: U8/QASYMM8/QASYMM8_SIGNED/S16/QSYMM16/S32/F16/F32. + * @param[in] policy Policy to use to handle overflow. + * @param[in] act_info (Optional) Activation layer information in case of a fused activation. + */ + void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to @ref ClSub::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SUB_H */ diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp new file mode 100644 index 0000000000..6429451a42 --- /dev/null +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -0,0 +1,45 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. 
+ * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#include "src/gpu/cl/operators/ClTranspose.h" + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClTransposeKernel.h" + +namespace arm_compute +{ +namespace opencl +{ +void ClTranspose::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst) +{ + auto k = std::make_unique<kernels::ClTransposeKernel>(); + k->configure(compile_context, src, dst); + _kernel = std::move(k); +} + +Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) +{ + return kernels::ClTransposeKernel::validate(src, dst); +} +} // namespace opencl +} // namespace arm_compute
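A hedged sketch of the `ClTranspose` contract: it swaps the first two dimensions (a 2D matrix transpose), so a `[W, H]` source is expected to validate against an `[H, W]` destination.

```cpp
#include "arm_compute/core/Error.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/operators/ClTranspose.h"

using namespace arm_compute;

bool can_transpose_64x128_f32()
{
    const TensorInfo src(TensorShape(64U, 128U), 1, DataType::F32);
    const TensorInfo dst(TensorShape(128U, 64U), 1, DataType::F32);
    return opencl::ClTranspose::validate(&src, &dst).error_code() == ErrorCode::OK;
}
```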
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClTranspose.h b/src/gpu/cl/operators/ClTranspose.h new file mode 100644 index 0000000000..3642fc23f9 --- /dev/null +++ b/src/gpu/cl/operators/ClTranspose.h @@ -0,0 +1,55 @@ +/* + * Copyright (c) 2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_TRANSPOSE_H +#define ARM_COMPUTE_CL_TRANSPOSE_H + +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" + +namespace arm_compute +{ +namespace opencl +{ +/** Basic function to run @ref kernels::ClTransposeKernel */ +class ClTranspose : public IClOperator +{ +public: + /** Initialise the kernel's inputs and outputs + * + * @param[in] compile_context The compile context to be used. + * @param[in] src The src tensor info. Data types supported: All. + * @param[in] dst The dst tensor info. Data types supported: Same as @p src + */ + void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst); + /** Static function to check if given info will lead to a valid configuration + * + * Similar to ClTranspose::configure() + * + * @return a status + */ + static Status validate(const ITensorInfo *src, const ITensorInfo *dst); +}; +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_TRANSPOSE_H */ diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp new file mode 100644 index 0000000000..fbf6442a80 --- /dev/null +++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp @@ -0,0 +1,306 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. 
IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/operators/ClWinogradConv2d.h"
+
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/experimental/Types.h"
+#include "arm_compute/core/utils/misc/ShapeCalculator.h"
+#include "arm_compute/runtime/CL/CLScheduler.h"
+#include "src/core/CL/kernels/CLFillBorderKernel.h"
+#include "src/core/helpers/MemoryHelpers.h"
+#include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h"
+#include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h"
+#include "src/gpu/cl/utils/ClAuxTensorHandler.h"
+#include "support/Cast.h"
+
+using namespace arm_compute::experimental;
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace
+{
+Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, DataLayout data_layout)
+{
+    Size2D output_tile = Size2D{};
+
+    const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height);
+
+    // Check if the input spatial dimensions are 4x4 or smaller (NCHW only)
+    const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW);
+
+    if(kernel_max_dim == 3U)
+    {
+        if(kernel_dims == Size2D(3U, 3U))
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U);
+        }
+        else if(kernel_dims == Size2D(3U, 1U))
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U);
+        }
+        else
+        {
+            output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U);
+        }
+    }
+    else if(kernel_max_dim == 5U)
+    {
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U,
+                             kernel_dims.height == 1 ? 1U : 4U);
+    }
+    else if(kernel_max_dim == 7U)
+    {
+        output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U,
+                             kernel_dims.height == 1 ?
1U : 2U);
+    }
+
+    return output_tile;
+}
+
+bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_size)
+{
+    // Check if we want to configure a Winograd configuration which requires fast math
+    using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>;
+
+    std::vector<WinogradConfiguration> fast_math_winograd =
+    {
+        WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)),
+        WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))
+    };
+
+    auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height),
+                            std::pair<int, int>(kernel_size.width, kernel_size.height));
+
+    return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end();
+}
+
+Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                          const ActivationLayerInfo &act_info, bool enable_fast_math)
+{
+    // Get indices for the width and height
+    const size_t idx_width  = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH);
+    const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT);
+
+    // Input shape, kernel size and output tile
+    const Size2D input_dims  = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]);
+    const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]);
+    const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout());
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size");
+
+    // Check if the Winograd configuration requires fast math
+    if(!enable_fast_math)
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false.
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + } + + const WinogradInfo winograd_info = WinogradInfo(output_tile, + kernel_size, + input_dims, + conv_info, + src->data_layout()); + + // Validate input transform + const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info)); + + // Validate filter transform + const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); + const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); + + // Validate batched matrix multiply + TensorShape batched_mm_output_shape = input0.tensor_shape(); + batched_mm_output_shape[0] = input1.tensor_shape()[0]; + const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); + + // Configure output transform + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); + return Status{}; +} + +} // namespace + +ClWinogradConv2d::ClWinogradConv2d() + : _batched_mm(), + _input_transform(std::make_unique<kernels::ClWinogradInputTransformKernel>()), + _filter_transform(std::make_unique<kernels::ClWinogradFilterTransformKernel>()), + _output_transform(std::make_unique<kernels::ClWinogradOutputTransformKernel>()), + _border_handler(), + _input0(), + _input1(), + _batched_mm_output(), + _is_prepared(false), + _aux_mem() +{ +} + +ClWinogradConv2d::~ClWinogradConv2d() = default; + +void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +{ + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); + // Get indices for the width and height + const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); + const size_t idx_height = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::HEIGHT); + + // Input shape, kernel size and output tile + const Size2D input_dims = Size2D(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height]); + const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); + const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); + + // Check if the Winograd configuration requires fast math + if(!enable_fast_math) + { + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
+ ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + } + const WinogradInfo winograd_info = WinogradInfo(output_tile, + kernel_size, + input_dims, + conv_info, + src->data_layout()); + + _is_prepared = false; + + // Configure input transform + _input_transform->configure(compile_context, src, &_input0, winograd_info); + _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue()); + + // Configure filter transform + _filter_transform->configure(compile_context, weights, &_input1, winograd_info); + + // Configure batched matrix multiply + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, + false, false, + GEMMLowpOutputStageInfo(), + (src->data_type() == DataType::F16))); + + // Configure output transform + _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); + + _aux_mem = _batched_mm.workspace(); + const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r) + { + return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); + }) ? + MemoryLifetime::Prepare : + MemoryLifetime::Persistent; + _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); + _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size())); + _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); +} + +Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, bool enable_fast_math) +{ + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); + return Status{}; +} + +void ClWinogradConv2d::run(ITensorPack &tensors) +{ + const bool is_gemm_reshaped = _aux_mem[3].lifetime == MemoryLifetime::Prepare; + + auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + CLAuxTensorHandler input0(offset_int_vec(2), _input0, tensors, true); + CLAuxTensorHandler input1(offset_int_vec(3), _input1, tensors, true, is_gemm_reshaped); + CLAuxTensorHandler batched_mm_output(offset_int_vec(4), _batched_mm_output, tensors, true); + + prepare(tensors); + + // Run input transform + ITensorPack pack_it + { + { TensorType::ACL_SRC, src }, + { TensorType::ACL_DST, input0.get() }, + }; + CLScheduler::get().enqueue_op(_border_handler, pack_it, false); + CLScheduler::get().enqueue_op(*_input_transform, pack_it, false); + + // Run batched matrix multiplication + ITensorPack pack_mm = tensors; + pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get()); + pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get()); + is_gemm_reshaped ? 
pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); + _batched_mm.run(pack_mm); + + // Run output transform + ITensorPack pack_ot + { + { TensorType::ACL_SRC_0, batched_mm_output.get() }, + { TensorType::ACL_SRC_1, biases }, + { TensorType::ACL_DST, dst }, + }; + CLScheduler::get().enqueue_op(*_output_transform, pack_ot); +} + +void ClWinogradConv2d::prepare(ITensorPack &tensors) +{ + if(!_is_prepared) + { + auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3))); + + CLAuxTensorHandler input1(_input1, *in1_aux); + ITensorPack pack_ft + { + { TensorType::ACL_SRC, weights }, + { TensorType::ACL_DST, input1.get() }, + }; + // Run filter transform and mark original weights as unused + CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); + weights->mark_as_unused(); + + // Prepare GEMM and release reshaped weights if marked unused by ClGemm + ITensorPack mm_prepare_pack = tensors; + mm_prepare_pack.add_tensor(ACL_SRC_1, input1.get()); + _batched_mm.prepare(mm_prepare_pack); + + CLScheduler::get().queue().finish(); + _is_prepared = true; + } +} + +experimental::MemoryRequirements ClWinogradConv2d::workspace() const +{ + return _aux_mem; +} +} // namespace opencl +} // namespace arm_compute
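A sketch of the two-phase lifecycle implemented above. The first `run()` triggers `prepare()`, which executes the filter transform exactly once, marks the original weights unused, and lets `ClGemm` release its reshaped weight copy; subsequent runs enqueue only the input transform, the batched GEMM and the output transform. Tensors in the pack are assumed already configured and allocated under the `ACL_SRC_0/1/2` and `ACL_DST` slots used by `run()`.

```cpp
#include "arm_compute/core/ITensorPack.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"

void run_winograd_twice(arm_compute::opencl::ClWinogradConv2d &conv,
                        arm_compute::ITensorPack              &tensors)
{
    // First call: run() internally calls prepare(), which transforms the
    // weights once and marks the original weight tensor as unused.
    conv.run(tensors);

    // Later calls skip the filter transform (_is_prepared == true);
    // only the per-inference kernels are enqueued.
    conv.run(tensors);
}
```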
\ No newline at end of file diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h new file mode 100644 index 0000000000..eb2f7a72b2 --- /dev/null +++ b/src/gpu/cl/operators/ClWinogradConv2d.h @@ -0,0 +1,126 @@ +/* + * Copyright (c) 2018-2021 Arm Limited. + * + * SPDX-License-Identifier: MIT + * + * Permission is hereby granted, free of charge, to any person obtaining a copy + * of this software and associated documentation files (the "Software"), to + * deal in the Software without restriction, including without limitation the + * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or + * sell copies of the Software, and to permit persons to whom the Software is + * furnished to do so, subject to the following conditions: + * + * The above copyright notice and this permission notice shall be included in all + * copies or substantial portions of the Software. + * + * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR + * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY, + * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE + * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER + * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM, + * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE + * SOFTWARE. + */ +#ifndef ARM_COMPUTE_CL_WINOGRADCONV2D_H +#define ARM_COMPUTE_CL_WINOGRADCONV2D_H + +#include "arm_compute/runtime/CL/CLTensor.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/IClOperator.h" +#include "src/gpu/cl/operators/ClGemm.h" + +namespace arm_compute +{ +class CLCompileContext; +class ITensorInfo; +namespace opencl +{ +namespace kernels +{ +class ClWinogradInputTransformKernel; +class ClWinogradFilterTransformKernel; +class ClWinogradOutputTransformKernel; +} // kernels +/** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: + * + * -# @ref kernels::ClWinogradInputTransformKernel + * -# @ref kernels::ClWinogradFilterTransformKernel (only once) + * -# @ref ClGemm + * -# @ref kernels::ClWinogradOutputTransformKernel + * + */ +class ClWinogradConv2d : public IClOperator +{ +public: + /** Default constructor */ + ClWinogradConv2d(); + /** Default destructor */ + ~ClWinogradConv2d(); + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClWinogradConv2d(const ClWinogradConv2d &) = delete; + /** Default move constructor */ + ClWinogradConv2d(ClWinogradConv2d &&) = default; + /** Prevent instances of this class from being copied (As this class contains pointers) */ + ClWinogradConv2d &operator=(const ClWinogradConv2d &) = delete; + /** Default move assignment operator */ + ClWinogradConv2d &operator=(ClWinogradConv2d &&) = default; + /** Set the input and output tensors. + * + * Valid data layouts: + * - NHWC + * - NCHW + * + * Valid data type configurations: + * |src0 |src1 |src2 |dst | + * |:--------------|:--------------|:------|:--------------| + * |F16 |F16 |F16 |F16 | + * |F32 |F32 |F32 |F32 | + * + * @note: This function only works with 3x3,3x1,1x3,5x5,5x1,1x5,7x1 and 1x7 kernels along with unit strides for both NCHW and NHWC data layout + * @note Some Winograd configurations (i.e. 
+     *
+     * @param[in]  compile_context  The compile context to be used.
+     * @param[in]  src              Source tensor info. 3 lower dimensions represent a single input [width, height, IFM],
+     *                              while every optional dimension from 4 and above represents a batch of inputs.
+     *                              Data types supported: F16/F32.
+     * @param[in]  weights          Weights tensor info. Weights are 4D tensor with dimensions [kernel_x, kernel_y, IFM, OFM]. Data type supported: Same as @p src.
+     * @param[in]  biases           Biases tensor info. Shared biases supported. Biases are 1D tensor with dimensions [OFM]. Data type supported: Same as @p src.
+     * @param[out] dst              Destination tensor info. 3 lower dimensions represent a single output [width, height, OFM], while the rest represent batch of outputs.
+     *                              Data types supported: Same as @p src.
+     * @param[in]  conv_info        Contains padding and stride information described in @ref PadStrideInfo.
+     * @param[in]  act_info         (Optional) Activation layer information in case of a fused activation.
+     * @param[in]  enable_fast_math (Optional) Enable fast math computation. If this flag is set, the function may dispatch the fastest implementation
+     *                              available, which can introduce a drop in accuracy. Default is false.
+     */
+    void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info,
+                   const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+    /** Static function to check if given info will lead to a valid configuration
+     *
+     * Similar to ClWinogradConv2d::configure()
+     *
+     * @return a status
+     */
+    static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info,
+                           const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false);
+
+    // Inherited methods overridden:
+    void run(ITensorPack &tensors) override;
+    void prepare(ITensorPack &tensors) override;
+    experimental::MemoryRequirements workspace() const override;
+
+private:
+    ClGemm                                                    _batched_mm;
+    std::unique_ptr<kernels::ClWinogradInputTransformKernel>  _input_transform;
+    std::unique_ptr<kernels::ClWinogradFilterTransformKernel> _filter_transform;
+    std::unique_ptr<kernels::ClWinogradOutputTransformKernel> _output_transform;
+    CLFillBorderKernel                                        _border_handler;
+    TensorInfo                                                _input0;
+    TensorInfo                                                _input1;
+    TensorInfo                                                _batched_mm_output;
+    bool                                                      _is_prepared;
+    experimental::MemoryRequirements                          _aux_mem{};
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_WINOGRADCONV2D_H */
diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h
new file mode 100644
index 0000000000..af383489a1
--- /dev/null
+++ b/src/gpu/cl/utils/ClAuxTensorHandler.h
@@ -0,0 +1,111 @@
+/*
+ * Copyright (c) 2021 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#ifndef ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
+#define ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H
+
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/TensorInfo.h"
+#include "arm_compute/runtime/CL/CLTensor.h"
+
+#include "src/common/utils/Log.h"
+#include "support/Cast.h"
+
+namespace arm_compute
+{
+namespace opencl
+{
+/** Tensor handler to wrap and handle tensor allocations on workspace buffers */
+class CLAuxTensorHandler
+{
+public:
+    CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false)
+        : _tensor()
+    {
+        if(info.total_size() == 0)
+        {
+            return;
+        }
+        _tensor.allocator()->soft_init(info);
+
+        ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id));
+        if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size()))
+        {
+            // No suitable buffer was packed at this slot: allocate our own
+            if(!bypass_alloc)
+            {
+                _tensor.allocator()->allocate();
+                ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor");
+            }
+
+            // Optionally publish the local tensor back into the pack so later stages can reuse it
+            if(pack_inject)
+            {
+                pack.add_tensor(slot_id, &_tensor);
+                _injected_tensor_pack = &pack;
+                _injected_slot_id     = slot_id;
+            }
+        }
+        else
+        {
+            // A large-enough buffer is already packed: import it instead of allocating
+            _tensor.allocator()->import_memory(packed_tensor->cl_buffer());
+        }
+    }
+
+    CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor)
+        : _tensor()
+    {
+        _tensor.allocator()->soft_init(info);
+        if(info.total_size() <= tensor.info()->total_size())
+        {
+            _tensor.allocator()->import_memory(tensor.cl_buffer());
+        }
+    }
+
+    CLAuxTensorHandler(const CLAuxTensorHandler &) = delete;
+    CLAuxTensorHandler &operator=(const CLAuxTensorHandler &) = delete;
+
+    ~CLAuxTensorHandler()
+    {
+        // Remove any injected tensor from the pack before it dangles
+        if(_injected_tensor_pack)
+        {
+            _injected_tensor_pack->remove_tensor(_injected_slot_id);
+        }
+    }
+
+    ICLTensor *get()
+    {
+        return &_tensor;
+    }
+
+    ICLTensor *operator()()
+    {
+        return &_tensor;
+    }
+
+private:
+    CLTensor     _tensor{};
+    ITensorPack *_injected_tensor_pack{ nullptr };
+    int          _injected_slot_id{ TensorType::ACL_UNKNOWN };
+};
+} // namespace opencl
+} // namespace arm_compute
+#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
\ No newline at end of file
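
A caller-side sketch may help place the prepare()/run() contract of ClWinogradConv2d in context. This is a minimal, illustrative fragment rather than library code: the helper name run_inference is invented, and the assumed pack layout (ACL_SRC_0 input, ACL_SRC_1 weights, ACL_SRC_2 biases, ACL_DST output, plus the auxiliary slots reported by workspace()) follows the conventions visible in the implementation above.

#include "arm_compute/core/ITensorPack.h"
#include "src/gpu/cl/operators/ClWinogradConv2d.h"

using namespace arm_compute;

// Hypothetical helper: `tensors` is assumed to already carry the source,
// weight, bias and destination tensors plus the auxiliary buffers
// advertised by conv.workspace().
void run_inference(opencl::ClWinogradConv2d &conv, ITensorPack &tensors, int num_runs)
{
    // One-time step: filter transform + ClGemm preparation; the original
    // weights are marked unused afterwards and may be freed by the caller.
    conv.prepare(tensors);

    for(int i = 0; i < num_runs; ++i)
    {
        // Steady state: input transform, batched GEMM, output transform.
        conv.run(tensors);
    }
}

Calling prepare() explicitly is optional here, since run() invokes it on first use; the split exists so callers can pay the one-time transform cost ahead of the first inference.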
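Likewise, the import-or-allocate behaviour of CLAuxTensorHandler can be summarised in a short sketch using only the API shown above; run_stage, scratch_info and slot are illustrative names, not part of the library.

#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/TensorInfo.h"
#include "src/gpu/cl/utils/ClAuxTensorHandler.h"

using namespace arm_compute;

// Illustrative stage body: if the caller packed a large-enough buffer at
// `slot`, the handler imports it (zero-copy); otherwise it allocates a
// scratch CLTensor that lives until the handler goes out of scope.
void run_stage(ITensorPack &tensors, TensorInfo &scratch_info, int slot)
{
    opencl::CLAuxTensorHandler scratch(slot, scratch_info, tensors);

    ITensorPack stage_pack
    {
        { TensorType::ACL_SRC, tensors.get_const_tensor(TensorType::ACL_SRC_0) },
        { TensorType::ACL_DST, scratch.get() },
    };
    // ... enqueue the stage's kernel(s) with stage_pack here ...
}

Passing pack_inject = true additionally publishes a freshly allocated scratch tensor back into the caller's pack, and the handler's destructor removes it again so the pack never holds a dangling pointer.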