Diffstat (limited to 'src/gpu/cl')
206 files changed, 6860 insertions, 4357 deletions
diff --git a/src/gpu/cl/ClContext.cpp b/src/gpu/cl/ClContext.cpp
index d8ef18e62e..611c1cb501 100644
--- a/src/gpu/cl/ClContext.cpp
+++ b/src/gpu/cl/ClContext.cpp
@@ -23,11 +23,11 @@
  */
 #include "src/gpu/cl/ClContext.h"
 
+#include "arm_compute/core/CL/CLKernelLibrary.h"
+
 #include "src/gpu/cl/ClQueue.h"
 #include "src/gpu/cl/ClTensor.h"
 
-#include "arm_compute/core/CL/CLKernelLibrary.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -41,7 +41,7 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
     bool                 status = false;
     mlgo::MLGOHeuristics heuristics;
 
-    if(filename != nullptr)
+    if (filename != nullptr)
     {
         status = heuristics.reload_from_file(filename);
     }
@@ -50,12 +50,9 @@ mlgo::MLGOHeuristics populate_mlgo(const char *filename)
 } // namespace
 
 ClContext::ClContext(const AclContextOptions *options)
-    : IContext(Target::GpuOcl),
-      _mlgo_heuristics(),
-      _cl_ctx(),
-      _cl_dev()
+    : IContext(Target::GpuOcl), _mlgo_heuristics(), _cl_ctx(), _cl_dev()
 {
-    if(options != nullptr)
+    if (options != nullptr)
     {
         _mlgo_heuristics = populate_mlgo(options->kernel_config_file);
     }
@@ -80,7 +77,7 @@ const mlgo::MLGOHeuristics &ClContext::mlgo() const
 
 bool ClContext::set_cl_ctx(::cl::Context ctx)
 {
-    if(this->refcount() == 0)
+    if (this->refcount() == 0)
     {
         _cl_ctx = ctx;
         CLScheduler::get().set_context(ctx);
@@ -92,7 +89,7 @@ bool ClContext::set_cl_ctx(::cl::Context ctx)
 ITensorV2 *ClContext::create_tensor(const AclTensorDescriptor &desc, bool allocate)
 {
     ClTensor *tensor = new ClTensor(this, desc);
-    if(tensor != nullptr && allocate)
+    if (tensor != nullptr && allocate)
     {
         tensor->allocate();
     }
diff --git a/src/gpu/cl/ClContext.h b/src/gpu/cl/ClContext.h
index a50b03124b..2c67ccf4d2 100644
--- a/src/gpu/cl/ClContext.h
+++ b/src/gpu/cl/ClContext.h
@@ -24,11 +24,11 @@
 #ifndef SRC_GPU_CLCONTEXT_H
 #define SRC_GPU_CLCONTEXT_H
 
+#include "arm_compute/core/CL/OpenCL.h"
+
 #include "src/common/IContext.h"
 #include "src/runtime/CL/mlgo/MLGOHeuristics.h"
 
-#include "arm_compute/core/CL/OpenCL.h"
-
 namespace arm_compute
 {
 namespace gpu
@@ -74,9 +74,9 @@ public:
     bool set_cl_ctx(::cl::Context ctx);
 
     // Inherrited methods overridden
-    ITensorV2 *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
-    IQueue    *create_queue(const AclQueueOptions *options) override;
-    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
+    ITensorV2                          *create_tensor(const AclTensorDescriptor &desc, bool allocate) override;
+    IQueue                             *create_queue(const AclQueueOptions *options) override;
+    std::tuple<IOperator *, StatusCode> create_activation(const AclTensorDescriptor &src,
                                                           const AclTensorDescriptor &dst,
                                                           const AclActivationDescriptor &act,
                                                           bool is_validate) override;
@@ -90,4 +90,4 @@ private:
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLCONTEXT_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLCONTEXT_H */
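The two guards reformatted above follow the same defensive pattern: populate_mlgo() treats a null config path as "use defaults", and set_cl_ctx() only swaps the backing OpenCL context while nothing references it. A minimal standalone sketch of that pattern; all names here are illustrative stand-ins, not the Compute Library API:

    #include <iostream>

    struct Heuristics
    {
        bool loaded{false};
    };

    // Like populate_mlgo(): a null config path is valid and yields defaults.
    Heuristics populate(const char *filename)
    {
        Heuristics h;
        if (filename != nullptr)
        {
            h.loaded = true; // stands in for heuristics.reload_from_file(filename)
        }
        return h;
    }

    class Context
    {
    public:
        // Like set_cl_ctx(): replacing the backend handle is refused once any
        // queue or tensor holds a reference to this context.
        bool set_backend_context(int handle)
        {
            if (refcount == 0)
            {
                backend = handle;
                return true;
            }
            return false;
        }
        int refcount{0};
        int backend{-1};
    };

    int main()
    {
        Context ctx;
        std::cout << populate(nullptr).loaded << ' ' << ctx.set_backend_context(42) << '\n'; // prints "0 1"
    }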
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 73bb96298e..bcade94522 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -37,24 +37,16 @@ namespace
 {
 /* Decoding table */
-constexpr std::array<uint8_t, 256> b64_invtab =
-{
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 62, 0, 0, 0, 63,
-    52, 53, 54, 55, 56, 57, 58, 59, 60, 61, 0, 0, 0, 0, 0, 0,
-    0, 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14,
-    15, 16, 17, 18, 19, 20, 21, 22, 23, 24, 25, 0, 0, 0, 0, 0,
-    0, 26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40,
-    41, 42, 43, 44, 45, 46, 47, 48, 49, 50, 51, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
-    0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,
+constexpr std::array<uint8_t, 256> b64_invtab = {
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  62, 0,  0,  0,  63, 52, 53, 54, 55, 56, 57, 58, 59, 60, 61,
+    0,  0,  0,  0,  0,  0,  0,  0,  1,  2,  3,  4,  5,  6,  7,  8,  9,  10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, 21,
+    22, 23, 24, 25, 0,  0,  0,  0,  0,  0,  26, 27, 28, 29, 30, 31, 32, 33, 34, 35, 36, 37, 38, 39, 40, 41, 42, 43, 44,
+    45, 46, 47, 48, 49, 50, 51, 0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
+    0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,  0,
 };
 
 /** Decode a base64 encoded string
@@ -68,13 +60,13 @@ std::string decode_base64(const std::string &str)
     constexpr const char pad_char = '=';
 
     // Handle empty string
-    if(str.empty())
+    if (str.empty())
     {
         return {};
     }
 
     // Base64 encoded string has size multiple of 4
-    if(str.length() % 4)
+    if (str.length() % 4)
     {
         return {};
     }
@@ -92,7 +84,7 @@ std::string decode_base64(const std::string &str)
     // Block decoding function (exclude padding)
     int       c   = 0;
     const int end = str_len - 4 - padding;
-    for(; c <= end; c += 4)
+    for (; c <= end; c += 4)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
@@ -105,7 +97,7 @@ std::string decode_base64(const std::string &str)
     }
 
     // Last step that might contain padding symbols
-    if(padding == 1)
+    if (padding == 1)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
@@ -114,7 +106,7 @@ std::string decode_base64(const std::string &str)
         dec_b64.push_back((byte0 << 2) | (byte1 >> 4));
         dec_b64.push_back((byte1 << 4) | (byte2 >> 2));
     }
-    else if(padding == 2)
+    else if (padding == 2)
     {
         const int byte0 = b64_invtab[str[c]];
         const int byte1 = b64_invtab[str[c + 1]];
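The block-decoding loop above packs four 6-bit base64 symbols into three output bytes with shifts and ORs. A standalone illustration of that arithmetic; building the inverse table from the alphabet at runtime is an assumption for brevity (the library embeds it as the constexpr b64_invtab above):

    #include <array>
    #include <cstdint>
    #include <iostream>
    #include <string>

    int main()
    {
        const std::string alphabet = "ABCDEFGHIJKLMNOPQRSTUVWXYZabcdefghijklmnopqrstuvwxyz0123456789+/";
        std::array<uint8_t, 256> invtab{};
        for (size_t i = 0; i < alphabet.size(); ++i)
        {
            invtab[static_cast<uint8_t>(alphabet[i])] = static_cast<uint8_t>(i);
        }

        const std::string group = "TWFu"; // base64 for "Man"
        const int b0 = invtab[static_cast<uint8_t>(group[0])];
        const int b1 = invtab[static_cast<uint8_t>(group[1])];
        const int b2 = invtab[static_cast<uint8_t>(group[2])];
        const int b3 = invtab[static_cast<uint8_t>(group[3])];

        std::string out;
        out.push_back(static_cast<char>(((b0 << 2) | (b1 >> 4)) & 0xFF)); // same shifts as the loop body above
        out.push_back(static_cast<char>(((b1 << 4) | (b2 >> 2)) & 0xFF));
        out.push_back(static_cast<char>(((b2 << 6) | b3) & 0xFF));
        std::cout << out << '\n'; // prints "Man"
    }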
@@ -135,7 +127,7 @@ std::string decompress_zlib(const std::string &str)
 {
     // Create and initialize decompression stream
     z_stream ds{};
-    if(inflateInit(&ds) != Z_OK)
+    if (inflateInit(&ds) != Z_OK)
     {
         return std::string();
     }
@@ -152,16 +144,15 @@ decompress_zlib(const std::string &str)
         ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
         status       = inflate(&ds, 0);
 
-        if(inflated_str.size() < ds.total_out)
+        if (inflated_str.size() < ds.total_out)
         {
             inflated_str.append(roll_buff, ds.total_out - inflated_str.size());
         }
-    }
-    while(status == Z_OK);
+    } while (status == Z_OK);
 
     // Finalize decompression stream
     inflateEnd(&ds);
-    if(status != Z_STREAM_END)
+    if (status != Z_STREAM_END)
     {
         return std::string();
     }
@@ -175,323 +166,321 @@ namespace arm_compute
 {
 namespace opencl
 {
-const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
-{
+const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map = {
     // Common Kernels
-    { "activation_layer", "common/activation_layer.cl" },
-    { "activation_layer_quant", "common/activation_layer_quant.cl" },
-    { "activation_layer_quant_f32", "common/activation_layer_quant.cl" },
-    { "arg_min_max_x", "common/arg_min_max.cl" },
-    { "arg_min_max_y", "common/arg_min_max.cl" },
-    { "arg_min_max_z", "common/arg_min_max.cl" },
-    { "arg_min_max_w", "common/arg_min_max.cl" },
-    { "bitwise_or", "common/bitwise_op.cl" },
-    { "bitwise_and", "common/bitwise_op.cl" },
-    { "bitwise_xor", "common/bitwise_op.cl" },
-    { "bitwise_not", "common/bitwise_op.cl" },
-    { "bounding_box_transform", "common/bounding_box_transform.cl" },
-    { "bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl" },
-    { "compare_equal", "common/comparisons.cl" },
-    { "compare_equal_quantized", "common/comparisons.cl" },
-    { "compare_notequal", "common/comparisons.cl" },
-    { "compare_notequal_quantized", "common/comparisons.cl" },
-    { "compare_greater", "common/comparisons.cl" },
-    { "compare_greater_quantized", "common/comparisons.cl" },
-    { "compare_greaterequal", "common/comparisons.cl" },
-    { "compare_greaterequal_quantized", "common/comparisons.cl" },
-    { "compare_less", "common/comparisons.cl" },
-    { "compare_less_quantized", "common/comparisons.cl" },
-    { "compare_lessequal", "common/comparisons.cl" },
-    { "compare_lessequal_quantized", "common/comparisons.cl" },
-    { "concatenate", "common/concatenate.cl" },
-    { "concatenate_width", "common/concatenate.cl" },
-    { "concatenate_height", "common/concatenate.cl" },
-    { "concatenate_width_x2", "common/concatenate.cl" },
-    { "concatenate_width_x4", "common/concatenate.cl" },
-    { "col2im", "common/col2im.cl" },
-    { "cast_down", "common/cast.cl" },
-    { "cast_up", "common/cast.cl" },
-    { "convert_fc_weights", "common/convert_fc_weights.cl" },
-    { "copy_tensor", "common/copy_tensor.cl" },
-    { "crop_tensor", "common/crop_tensor.cl" },
-    { "deconvolution_reshape", "common/deconvolution_layer.cl" },
-    { "deconvolution_upsample", "common/deconvolution_layer.cl" },
-    { "dequantization_layer", "common/dequantization_layer.cl" },
-    { "elementwise_operation_ADD", "common/elementwise_operation.cl" },
-    { "elementwise_operation_SUB", "common/elementwise_operation.cl" },
-    { "elementwise_operation_MAX", "common/elementwise_operation.cl" },
-    { "elementwise_operation_MIN", "common/elementwise_operation.cl" },
-    { "elementwise_operation_DIV", "common/elementwise_operation.cl" },
-    { "elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl" },
-    { "elementwise_operation_POWER", "common/elementwise_operation.cl" },
-    { "elementwise_operation_PRELU", "common/elementwise_operation.cl" },
-    { "elementwise_operation_AND", "common/elementwise_operation.cl" },
-    { "elementwise_operation_OR", "common/elementwise_operation.cl" },
-    { "elementwise_operation_ADD_quantized",
"common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" }, - { "elementwise_unary", "common/elementwise_unary.cl" }, - { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" }, - { "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" }, - { "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" }, - { "fft_radix_2_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_2_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_2_axis_0", "common/fft.cl" }, - { "fft_radix_2_axis_1", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_3_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_3_axis_0", "common/fft.cl" }, - { "fft_radix_3_axis_1", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_4_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_4_axis_0", "common/fft.cl" }, - { "fft_radix_4_axis_1", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_5_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_5_axis_0", "common/fft.cl" }, - { "fft_radix_5_axis_1", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_7_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_7_axis_0", "common/fft.cl" }, - { "fft_radix_7_axis_1", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_0", "common/fft.cl" }, - { "fft_radix_8_first_stage_axis_1", "common/fft.cl" }, - { "fft_radix_8_axis_0", "common/fft.cl" }, - { "fft_radix_8_axis_1", "common/fft.cl" }, - { "fft_scale_conj", "common/fft_scale.cl" }, - { "fill_image_borders_constant", "common/fill_border.cl" }, - { "fill_image_borders_replicate", "common/fill_border.cl" }, - { "floor_layer", "common/floor.cl" }, - { "fuse_batchnormalization_layer", "common/batchnormalization_layer.cl" }, - { "gather", "common/gather.cl" }, - { "gemm_ma_f16", "common/gemm.cl" }, - { "gemm_ma_f32", "common/gemm.cl" }, - { "gemm_mv", "common/gemv.cl" }, - { "gemm_mv_quantized", "common/gemv.cl" }, - { "gemm_mm_native", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t", "common/gemm.cl" }, - { "gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl" }, - { "gemm_lc_vm_f32", "common/gemm.cl" }, - { "gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl" }, - { "gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl" }, - { 
"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl" }, - { "gemmlowp_matrix_b_reduction", "common/gemmlowp.cl" }, - { "gemmlowp_mm_native", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl" }, - { "gemmlowp_offset_contribution", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl" }, - { "gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl" }, - { "generate_proposals_compute_all_anchors", "common/generate_proposals.cl" }, - { "generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl" }, - { "instance_normalization", "common/instance_normalization.cl" }, - { "compute_mean_var", "common/instance_normalization.cl" }, - { "l2_normalize_x", "common/l2_normalize.cl" }, - { "l2_normalize_y", "common/l2_normalize.cl" }, - { "l2_normalize_z", "common/l2_normalize.cl" }, - { "mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl" }, - { "mat_mul_native_nt_nt", "common/mat_mul.cl" }, - { "mat_mul_native_nt_t", "common/mat_mul.cl" }, - { "mat_mul_native_t_nt", "common/mat_mul.cl" }, - { "mat_mul_native_t_t", "common/mat_mul.cl" }, - { "mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl" }, - { "mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl" }, - { "mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl" }, - { "max_unpooling_layer_2", "common/unpooling_layer.cl" }, - { "mean_stddev_normalization", "common/mean_stddev_normalization.cl" }, - { "memset", "common/memset.cl" }, - { "minmax_layer", "common/minmax_layer.cl" }, - { "non_max_suppression", "common/nonmax.cl" }, - { "pad_layer_constant", "common/pad_layer.cl" }, - { "pad_layer_symmetric_reflect", "common/pad_layer.cl" }, - { "permute", "common/permute.cl" }, - { "pixelwise_mul_complex", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_float", "common/pixelwise_mul_float.cl" }, - { "pixelwise_mul_int", "common/pixelwise_mul_int.cl" }, - { "pixelwise_mul_quantized", "common/pixelwise_mul_int.cl" }, - { "qlstm_layer_normalization", "common/qlstm_layer_normalization.cl" }, - { "quantization_layer", "common/quantization_layer.cl" }, - { "range", "common/range.cl" }, - { "range_quantized", "common/range.cl" }, - { "reduction_operation_x", "common/reduction_operation.cl" }, - { "reduction_operation_non_parallel_x", 
"common/reduction_operation.cl" }, - { "reduction_operation_y", "common/reduction_operation.cl" }, - { "reduction_operation_z", "common/reduction_operation.cl" }, - { "reduction_operation_w", "common/reduction_operation.cl" }, - { "reshape_layer", "common/reshape_layer.cl" }, - { "reshape_to_columns", "common/convolution_layer.cl" }, - { "reverse", "common/reverse.cl" }, - { "roi_align_layer", "common/roi_align_layer.cl" }, - { "roi_align_layer_quantized", "common/roi_align_layer_quantized.cl" }, - { "roi_pooling_layer", "common/roi_pooling_layer.cl" }, - { "select_same_rank", "common/select.cl" }, - { "select_different_rank_2", "common/select.cl" }, - { "select_different_rank_n", "common/select.cl" }, - { "softmax_layer_norm", "common/softmax_layer.cl" }, - { "softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl" }, - { "softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl" }, - { "softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl" }, - { "stack_layer", "common/stack_layer.cl" }, - { "strided_slice", "common/slice_ops.cl" }, - { "tile", "common/tile.cl" }, - { "transpose", "common/transpose.cl" }, + {"activation_layer", "common/activation_layer.cl"}, + {"activation_layer_quant", "common/activation_layer_quant.cl"}, + {"activation_layer_quant_f32", "common/activation_layer_quant.cl"}, + {"arg_min_max_x", "common/arg_min_max.cl"}, + {"arg_min_max_y", "common/arg_min_max.cl"}, + {"arg_min_max_z", "common/arg_min_max.cl"}, + {"arg_min_max_w", "common/arg_min_max.cl"}, + {"bitwise_or", "common/bitwise_op.cl"}, + {"bitwise_and", "common/bitwise_op.cl"}, + {"bitwise_xor", "common/bitwise_op.cl"}, + {"bitwise_not", "common/bitwise_op.cl"}, + {"bounding_box_transform", "common/bounding_box_transform.cl"}, + {"bounding_box_transform_quantized", "common/bounding_box_transform_quantized.cl"}, + {"compare_equal", "common/comparisons.cl"}, + {"compare_equal_quantized", "common/comparisons.cl"}, + {"compare_notequal", "common/comparisons.cl"}, + {"compare_notequal_quantized", "common/comparisons.cl"}, + {"compare_greater", "common/comparisons.cl"}, + {"compare_greater_quantized", "common/comparisons.cl"}, + {"compare_greaterequal", "common/comparisons.cl"}, + {"compare_greaterequal_quantized", "common/comparisons.cl"}, + {"compare_less", "common/comparisons.cl"}, + {"compare_less_quantized", "common/comparisons.cl"}, + {"compare_lessequal", "common/comparisons.cl"}, + {"compare_lessequal_quantized", "common/comparisons.cl"}, + {"concatenate", "common/concatenate.cl"}, + {"concatenate_width", "common/concatenate.cl"}, + {"concatenate_height", "common/concatenate.cl"}, + {"concatenate_width_x2", "common/concatenate.cl"}, + {"concatenate_width_x4", "common/concatenate.cl"}, + {"col2im", "common/col2im.cl"}, + {"cast_down", "common/cast.cl"}, + {"cast_up", "common/cast.cl"}, + {"convert_fc_weights", "common/convert_fc_weights.cl"}, + {"copy_tensor", "common/copy_tensor.cl"}, + {"crop_tensor", "common/crop_tensor.cl"}, + {"deconvolution_reshape", "common/deconvolution_layer.cl"}, + {"deconvolution_upsample", "common/deconvolution_layer.cl"}, + {"dequantization_layer", "common/dequantization_layer.cl"}, + {"elementwise_operation_ADD", "common/elementwise_operation.cl"}, + {"elementwise_operation_SUB", "common/elementwise_operation.cl"}, + {"elementwise_operation_MAX", 
"common/elementwise_operation.cl"}, + {"elementwise_operation_MIN", "common/elementwise_operation.cl"}, + {"elementwise_operation_DIV", "common/elementwise_operation.cl"}, + {"elementwise_operation_SQUARED_DIFF", "common/elementwise_operation.cl"}, + {"elementwise_operation_POWER", "common/elementwise_operation.cl"}, + {"elementwise_operation_PRELU", "common/elementwise_operation.cl"}, + {"elementwise_operation_AND", "common/elementwise_operation.cl"}, + {"elementwise_operation_OR", "common/elementwise_operation.cl"}, + {"elementwise_operation_ADD_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SUB_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MAX_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_MIN_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_DIV_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl"}, + {"elementwise_unary", "common/elementwise_unary.cl"}, + {"elementwise_unary_quantized", "common/elementwise_unary_quantized.cl"}, + {"fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl"}, + {"fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl"}, + {"fft_radix_2_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_2_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_2_axis_0", "common/fft.cl"}, + {"fft_radix_2_axis_1", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_3_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_3_axis_0", "common/fft.cl"}, + {"fft_radix_3_axis_1", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_4_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_4_axis_0", "common/fft.cl"}, + {"fft_radix_4_axis_1", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_5_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_5_axis_0", "common/fft.cl"}, + {"fft_radix_5_axis_1", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_7_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_7_axis_0", "common/fft.cl"}, + {"fft_radix_7_axis_1", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_0", "common/fft.cl"}, + {"fft_radix_8_first_stage_axis_1", "common/fft.cl"}, + {"fft_radix_8_axis_0", "common/fft.cl"}, + {"fft_radix_8_axis_1", "common/fft.cl"}, + {"fft_scale_conj", "common/fft_scale.cl"}, + {"fill_image_borders_constant", "common/fill_border.cl"}, + {"fill_image_borders_replicate", "common/fill_border.cl"}, + {"floor_layer", "common/floor.cl"}, + {"fuse_batchnormalization_layer", "common/batchnormalization_layer.cl"}, + {"gather", "common/gather.cl"}, + {"gemm_ma_f16", "common/gemm.cl"}, + {"gemm_ma_f32", "common/gemm.cl"}, + {"gemm_mv", "common/gemv.cl"}, + {"gemm_mv_quantized", "common/gemv.cl"}, + {"gemm_mm_native", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_only_rhs_nt_mmul_texture", "common/gemm_reshaped_only_rhs_mmul.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_nt_rhs_t_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt", "common/gemm.cl"}, + {"gemm_mm_reshaped_lhs_t_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_nt", "common/gemm.cl"}, + 
{"gemm_mm_reshaped_only_rhs_nt_texture", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t", "common/gemm.cl"}, + {"gemm_mm_reshaped_only_rhs_t_texture", "common/gemm.cl"}, + {"gemm_lc_vm_f32", "common/gemm.cl"}, + {"gemm_reshape_lhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_lhs_matrix_t", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_nt", "common/gemm_utils.cl"}, + {"gemm_reshape_rhs_matrix_t", "common/gemm_utils.cl"}, + {"gemmlowp_matrix_a_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_a_reduction_dot8", "common/gemmlowp.cl"}, + {"gemmlowp_matrix_b_reduction", "common/gemmlowp.cl"}, + {"gemmlowp_mm_native", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_lhs_nt_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_t_fused_output_stage_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_mm_reshaped_only_rhs_mmul", "common/gemmlowp_reshaped_only_rhs_mmul.cl"}, + {"gemmlowp_offset_contribution", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_offset_contribution_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16", "common/gemmlowp.cl"}, + {"gemmlowp_output_stage_quantize_down_float", "common/gemmlowp.cl"}, + {"generate_proposals_compute_all_anchors", "common/generate_proposals.cl"}, + {"generate_proposals_compute_all_anchors_quantized", "common/generate_proposals_quantized.cl"}, + {"instance_normalization", "common/instance_normalization.cl"}, + {"compute_mean_var", "common/instance_normalization.cl"}, + {"l2_normalize_x", "common/l2_normalize.cl"}, + {"l2_normalize_y", "common/l2_normalize.cl"}, + {"l2_normalize_z", "common/l2_normalize.cl"}, + {"mat_mul_native_mmul_nt_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_nt", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_nt_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_mmul_t_t", "common/mat_mul_mmul.cl"}, + {"mat_mul_native_nt_nt", "common/mat_mul.cl"}, + {"mat_mul_native_nt_t", "common/mat_mul.cl"}, + {"mat_mul_native_t_nt", "common/mat_mul.cl"}, + {"mat_mul_native_t_t", "common/mat_mul.cl"}, + {"mat_mul_native_quantized_nt_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_nt_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_nt", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_t_t", "common/mat_mul_quantized.cl"}, + {"mat_mul_native_quantized_mmul_nt_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_nt_t", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_nt", "common/mat_mul_quantized_mmul.cl"}, + {"mat_mul_native_quantized_mmul_t_t", "common/mat_mul_quantized_mmul.cl"}, + {"max_unpooling_layer_2", "common/unpooling_layer.cl"}, + {"mean_stddev_normalization", "common/mean_stddev_normalization.cl"}, + {"memset", "common/memset.cl"}, + {"minmax_layer", "common/minmax_layer.cl"}, + {"non_max_suppression", "common/nonmax.cl"}, + {"pad_layer_constant", "common/pad_layer.cl"}, + {"pad_layer_symmetric_reflect", "common/pad_layer.cl"}, + {"permute", "common/permute.cl"}, + {"pixelwise_mul_complex", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_float", "common/pixelwise_mul_float.cl"}, + {"pixelwise_mul_int", "common/pixelwise_mul_int.cl"}, + {"pixelwise_mul_quantized", 
"common/pixelwise_mul_int.cl"}, + {"qlstm_layer_normalization", "common/qlstm_layer_normalization.cl"}, + {"quantization_layer", "common/quantization_layer.cl"}, + {"range", "common/range.cl"}, + {"range_quantized", "common/range.cl"}, + {"reduction_operation_x", "common/reduction_operation.cl"}, + {"reduction_operation_non_parallel_x", "common/reduction_operation.cl"}, + {"reduction_operation_y", "common/reduction_operation.cl"}, + {"reduction_operation_z", "common/reduction_operation.cl"}, + {"reduction_operation_w", "common/reduction_operation.cl"}, + {"reshape_layer", "common/reshape_layer.cl"}, + {"reshape_to_columns", "common/convolution_layer.cl"}, + {"reverse", "common/reverse.cl"}, + {"roi_align_layer", "common/roi_align_layer.cl"}, + {"roi_align_layer_quantized", "common/roi_align_layer_quantized.cl"}, + {"roi_pooling_layer", "common/roi_pooling_layer.cl"}, + {"select_same_rank", "common/select.cl"}, + {"select_different_rank_2", "common/select.cl"}, + {"select_different_rank_n", "common/select.cl"}, + {"softmax_layer_norm", "common/softmax_layer.cl"}, + {"softmax_layer_norm_quantized", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_serial", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_quantized_parallel", "common/softmax_layer_quantized.cl"}, + {"softmax_layer_max_shift_exp_sum_serial", "common/softmax_layer.cl"}, + {"softmax_layer_max_shift_exp_sum_parallel", "common/softmax_layer.cl"}, + {"stack_layer", "common/stack_layer.cl"}, + {"strided_slice", "common/slice_ops.cl"}, + {"tile", "common/tile.cl"}, + {"transpose", "common/transpose.cl"}, #ifdef ENABLE_NCHW_KERNELS - { "batch_to_space_nchw", "nchw/batch_to_space.cl" }, - { "batch_to_space_static_nchw", "nchw/batch_to_space.cl" }, - { "batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl" }, - { "channel_shuffle_nchw", "nchw/channel_shuffle.cl" }, - { "depth_to_space_nchw", "nchw/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl" }, - { "direct_convolution1x1", "nchw/direct_convolution1x1.cl" }, - { "direct_convolution_nchw", "nchw/direct_convolution.cl" }, + {"batch_to_space_nchw", "nchw/batch_to_space.cl"}, + {"batch_to_space_static_nchw", "nchw/batch_to_space.cl"}, + {"batchnormalization_layer_nchw", "nchw/batchnormalization_layer.cl"}, + {"channel_shuffle_nchw", "nchw/channel_shuffle.cl"}, + {"depth_to_space_nchw", "nchw/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nchw", "nchw/dequantization_layer.cl"}, + {"direct_convolution1x1", "nchw/direct_convolution1x1.cl"}, + {"direct_convolution_nchw", "nchw/direct_convolution.cl"}, - { "im2col1x1_stridex1_nchw", "nchw/im2col.cl" }, - { "im2col3x3_nchw", "nchw/im2col.cl" }, - { "im2col5x5_nchw", "nchw/im2col.cl" }, - { "im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "im2col_generic_nchw", "nchw/im2col.cl" }, - { "im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl" }, - { "normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl" }, - { "normalization_layer_in_map_nchw", "nchw/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nchw", "nchw/pooling_layer.cl" }, - { "pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl" }, - { "prior_box_layer_nchw", "nchw/prior_box_layer.cl" }, - { "reorg_layer_nchw", "nchw/reorg_layer.cl" }, - { "scale_nearest_neighbour_nchw", 
"nchw/scale.cl" }, - { "scale_bilinear_nchw", "nchw/scale.cl" }, - { "space_to_batch_nchw", "nchw/space_to_batch.cl" }, - { "space_to_batch_static_nchw", "nchw/space_to_batch.cl" }, - { "space_to_depth_nchw", "nchw/space_to_depth.cl" }, - { "upsample_layer_nchw", "nchw/upsample_layer.cl" }, - { "winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl" }, - { "winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl" }, + {"im2col1x1_stridex1_nchw", "nchw/im2col.cl"}, + {"im2col3x3_nchw", "nchw/im2col.cl"}, + {"im2col5x5_nchw", "nchw/im2col.cl"}, + {"im2col11x11_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"im2col_generic_nchw", "nchw/im2col.cl"}, + {"im2col_generic_padx0_pady0_nchw", "nchw/im2col.cl"}, + {"normalization_layer_cross_map_nchw", "nchw/normalization_layer.cl"}, + {"normalization_layer_in_map_nchw", "nchw/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nchw", "nchw/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nchw", "nchw/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nchw", "nchw/pooling_layer.cl"}, + 
{"pooling_layer_2_nchw_indices", "nchw/pooling_layer.cl"}, + {"prior_box_layer_nchw", "nchw/prior_box_layer.cl"}, + {"reorg_layer_nchw", "nchw/reorg_layer.cl"}, + {"scale_nearest_neighbour_nchw", "nchw/scale.cl"}, + {"scale_bilinear_nchw", "nchw/scale.cl"}, + {"space_to_batch_nchw", "nchw/space_to_batch.cl"}, + {"space_to_batch_static_nchw", "nchw/space_to_batch.cl"}, + {"space_to_depth_nchw", "nchw/space_to_depth.cl"}, + {"upsample_layer_nchw", "nchw/upsample_layer.cl"}, + {"winograd_filter_transform_2x2_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_3x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nchw", "nchw/winograd_filter_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_3x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_3x1_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x3_stepz2_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x5_stepz1_nchw", "nchw/winograd_input_transform.cl"}, + {"winograd_output_transform_2x2_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nchw", "nchw/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nchw", "nchw/winograd_output_transform.cl"}, #endif /* ENABLE_NCHW_KERNELS */ #ifdef ENABLE_NHWC_KERNELS - { "batch_to_space_nhwc", "nhwc/batch_to_space.cl" }, - { "batch_to_space_static_nhwc", "nhwc/batch_to_space.cl" }, - { "batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl" }, - { "channel_shuffle_nhwc", "nhwc/channel_shuffle.cl" }, - { "depth_to_space_nhwc", "nhwc/depth_to_space.cl" }, - { "dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl" }, - { "dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl" }, - { 
"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl" }, - { "direct_convolution_nhwc", "nhwc/direct_convolution.cl" }, - { "direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl" }, - { "im2col3x3_nhwc", "nhwc/im2col.cl" }, - { "im2col9x9_nhwc", "nhwc/im2col.cl" }, - { "im2col_generic_nhwc", "nhwc/im2col.cl" }, - { "indirect_convolution_nhwc", "nhwc/indirect_convolution.cl" }, - { "indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl" }, - { "normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl" }, - { "normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl" }, - { "normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl" }, - { "pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl" }, - { "pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl" }, - { "pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl" }, - { "pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl" }, - { "reorg_layer_nhwc", "nhwc/reorg_layer.cl" }, - { "scale_nearest_neighbour_nhwc", "nhwc/scale.cl" }, - { "scale_bilinear_nhwc", "nhwc/scale.cl" }, - { "space_to_batch_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_batch_static_nhwc", "nhwc/space_to_batch.cl" }, - { "space_to_depth_nhwc", "nhwc/space_to_depth.cl" }, - { "transposed_convolution_nhwc", "nhwc/transposed_convolution.cl" }, - { "upsample_layer_nhwc", "nhwc/upsample_layer.cl" }, - { "winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl" }, - { "winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl" }, - { "winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_4x1_5x1_nhwc", 
"nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl" }, - { "winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl" }, + {"batch_to_space_nhwc", "nhwc/batch_to_space.cl"}, + {"batch_to_space_static_nhwc", "nhwc/batch_to_space.cl"}, + {"batchnormalization_layer_nhwc", "nhwc/batchnormalization_layer.cl"}, + {"channel_shuffle_nhwc", "nhwc/channel_shuffle.cl"}, + {"depth_to_space_nhwc", "nhwc/depth_to_space.cl"}, + {"dequantization_layer_per_channel_nhwc", "nhwc/dequantization_layer.cl"}, + {"dwc_native_fp_nhwc", "nhwc/dwc_native_fp_nhwc.cl"}, + {"dwc_native_quantized_nhwc", "nhwc/dwc_native_quantized_nhwc.cl"}, + {"direct_convolution_nhwc", "nhwc/direct_convolution.cl"}, + {"direct_convolution3d_ndhwc", "nhwc/direct_convolution3d.cl"}, + {"im2col3x3_nhwc", "nhwc/im2col.cl"}, + {"im2col9x9_nhwc", "nhwc/im2col.cl"}, + {"im2col_generic_nhwc", "nhwc/im2col.cl"}, + {"indirect_convolution_nhwc", "nhwc/indirect_convolution.cl"}, + {"indirect_convolution_address_precalculation", "nhwc/indirect_convolution.cl"}, + {"normalization_layer_cross_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalization_layer_in_map_nhwc", "nhwc/normalization_layer.cl"}, + {"normalize_planar_yuv_layer_nhwc", "nhwc/normalize_planar_yuv_layer.cl"}, + {"normalize_planar_yuv_layer_q8_nhwc", "nhwc/normalize_planar_yuv_layer_quantized.cl"}, + {"pooling_layer_MxN_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_2x2_nhwc", "nhwc/pooling_layer.cl"}, + {"pooling_layer_MxN_quantized_nhwc", "nhwc/pooling_layer_quantized.cl"}, + {"pooling_3d_layer_MxN_ndhwc", "nhwc/pooling_3d_layer.cl"}, + {"pooling_3d_layer_MxN_ndhwc_quantized", "nhwc/pooling_3d_layer_quantized.cl"}, + {"reorg_layer_nhwc", "nhwc/reorg_layer.cl"}, + {"scale_nearest_neighbour_nhwc", "nhwc/scale.cl"}, + {"scale_bilinear_nhwc", "nhwc/scale.cl"}, + {"space_to_batch_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_batch_static_nhwc", "nhwc/space_to_batch.cl"}, + {"space_to_depth_nhwc", "nhwc/space_to_depth.cl"}, + {"transposed_convolution_nhwc", "nhwc/transposed_convolution.cl"}, + {"upsample_layer_nhwc", "nhwc/upsample_layer.cl"}, + {"winograd_filter_transform_4x1_3x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_3x3_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x4_5x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_4x1_5x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x4_1x5_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x2_7x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_2x1_7x1_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_filter_transform_1x2_1x7_nhwc", "nhwc/winograd_filter_transform.cl"}, + {"winograd_input_transform_4x1_3x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x4_1x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_3x3_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x4_5x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_4x1_5x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + 
{"winograd_input_transform_1x4_1x5_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x2_7x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_2x1_7x1_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_input_transform_1x2_1x7_stepz1_nhwc", "nhwc/winograd_input_transform.cl"}, + {"winograd_output_transform_4x1_3x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_3x3_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x4_5x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_4x1_5x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x4_1x5_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x2_7x7_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_2x1_7x1_nhwc", "nhwc/winograd_output_transform.cl"}, + {"winograd_output_transform_1x2_1x7_nhwc", "nhwc/winograd_output_transform.cl"}, #endif /* ENABLE_NHWC_KERNELS */ }; -const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = -{ +const std::map<std::string, std::string> ClKernelLibrary::_program_source_map = { #ifdef EMBEDDED_KERNELS { "activation_float_helpers.h", @@ -996,7 +985,7 @@ std::string ClKernelLibrary::program_name(const std::string &kernel_name) const // Find which program contains the kernel auto kernel_program_it = _kernel_program_map.find(kernel_name); - if(_kernel_program_map.end() == kernel_program_it) + if (_kernel_program_map.end() == kernel_program_it) { ARM_COMPUTE_ERROR_VAR("Kernel %s not found in the CLKernelLibrary", kernel_name.c_str()); } @@ -1022,14 +1011,14 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr #ifdef EMBEDDED_KERNELS #ifdef ARM_COMPUTE_COMPRESSED_KERNELS const auto inflatted_program_source_it = _decompressed_source_map.find(program_name); - if(inflatted_program_source_it != _decompressed_source_map.end()) + if (inflatted_program_source_it != _decompressed_source_map.end()) { - return ClProgramInfo{ inflatted_program_source_it->second, false }; + return ClProgramInfo{inflatted_program_source_it->second, false}; } #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ const auto program_source_it = _program_source_map.find(program_name); - if(program_source_it == _program_source_map.end()) + if (program_source_it == _program_source_map.end()) { ARM_COMPUTE_ERROR_VAR("Embedded program for %s does not exist.", program_name.c_str()); } @@ -1042,7 +1031,7 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr program_source = std::move(decompressed_program_source); #endif /* ARM_COMPUTE_COMPRESSED_KERNELS */ - return ClProgramInfo{ program_source, false }; + return ClProgramInfo{program_source, false}; #else /* EMBEDDED_KERNELS */ // Check for binary std::string source_name = _kernel_path + program_name; @@ -1050,12 +1039,12 @@ ClKernelLibrary::ClProgramInfo ClKernelLibrary::program(const std::string &progr std::string program_source{}; bool is_binary = false; - if(std::ifstream(binary_name).is_open()) + if (std::ifstream(binary_name).is_open()) { program_source = read_file(binary_name, true); is_binary = true; } - else if(std::ifstream(source_name).is_open()) + else if (std::ifstream(source_name).is_open()) { program_source = read_file(source_name, false); } @@ -1064,7 +1053,7 @@ ClKernelLibrary::ClProgramInfo 
ClKernelLibrary::program(const std::string &progr
         ARM_COMPUTE_ERROR_VAR("Kernel file %s does not exist.", source_name.c_str());
     }
 
-    return ClProgramInfo{ program_source, is_binary };
+    return ClProgramInfo{program_source, is_binary};
 #endif /* EMBEDDED_KERNELS */
 }
 } // namespace opencl
diff --git a/src/gpu/cl/ClKernelLibrary.h b/src/gpu/cl/ClKernelLibrary.h
index 42bec95032..cd1d689199 100644
--- a/src/gpu/cl/ClKernelLibrary.h
+++ b/src/gpu/cl/ClKernelLibrary.h
@@ -52,8 +52,8 @@ public:
     /** Structure to encapsulte program related information */
     struct ClProgramInfo
     {
-        std::string program{};          /**< Program raw string */
-        bool        is_binary{ false }; /**< Flag that indicates if is in binary format */
+        std::string program{};        /**< Program raw string */
+        bool        is_binary{false}; /**< Flag that indicates if is in binary format */
     };
 
 public:
@@ -84,10 +84,12 @@ public:
     std::string program_name(const std::string &kernel_name) const;
 
 private:
-    std::string                                     _kernel_path{};             /**< Path to the kernels folder. */
-    mutable std::map<std::string, std::string>      _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
-    static const std::map<std::string, std::string> _kernel_program_map;        /**< Map that associates kernel names with programs. */
-    static const std::map<std::string, std::string> _program_source_map;        /**< Contains sources for all programs.
+    std::string _kernel_path{}; /**< Path to the kernels folder. */
+    mutable std::map<std::string, std::string>
+        _decompressed_source_map{}; /**< Map holding the decompressed files when compression is used */
+    static const std::map<std::string, std::string>
+        _kernel_program_map; /**< Map that associates kernel names with programs. */
+    static const std::map<std::string, std::string> _program_source_map; /**< Contains sources for all programs.
                                                                                Used for compile-time kernel inclusion.
                                                                                >*/
 };
 } // namespace opencl
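When ARM_COMPUTE_COMPRESSED_KERNELS is enabled, program() above inflates a base64-decoded, zlib-compressed source and caches it in _decompressed_source_map. A standalone round trip using the same do/while inflate() loop shape as decompress_zlib(); buffer sizes and names here are illustrative:

    #include <zlib.h>
    #include <iostream>
    #include <string>

    int main()
    {
        const std::string src = "embedded OpenCL kernel source, embedded OpenCL kernel source";

        // Compress into a worst-case sized buffer.
        uLongf comp_len = compressBound(src.size());
        std::string comp(comp_len, '\0');
        compress(reinterpret_cast<Bytef *>(&comp[0]), &comp_len,
                 reinterpret_cast<const Bytef *>(src.data()), src.size());

        // Decompress through a small rolling buffer, appending whatever each
        // inflate() call produced, until Z_STREAM_END.
        z_stream ds{};
        inflateInit(&ds);
        ds.avail_in = static_cast<uInt>(comp_len);
        ds.next_in  = reinterpret_cast<Bytef *>(&comp[0]);

        char        roll_buff[16];
        std::string out;
        int         status = Z_OK;
        do
        {
            ds.avail_out = sizeof(roll_buff);
            ds.next_out  = reinterpret_cast<Bytef *>(roll_buff);
            status       = inflate(&ds, 0);
            if (out.size() < ds.total_out)
            {
                out.append(roll_buff, ds.total_out - out.size());
            }
        } while (status == Z_OK);
        inflateEnd(&ds);

        std::cout << (out == src ? "round trip ok" : "mismatch") << '\n';
    }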
diff --git a/src/gpu/cl/ClQueue.cpp b/src/gpu/cl/ClQueue.cpp
index 2123adcf39..0cb7af5b61 100644
--- a/src/gpu/cl/ClQueue.cpp
+++ b/src/gpu/cl/ClQueue.cpp
@@ -36,7 +36,7 @@ namespace
 {
 CLTunerMode map_tuner_mode(AclTuningMode mode)
 {
-    switch(mode)
+    switch (mode)
     {
         case AclRapid:
             return CLTunerMode::RAPID;
@@ -55,7 +55,7 @@ CLTunerMode map_tuner_mode(AclTuningMode mode)
 
 std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 {
-    if(options == nullptr || options->mode == AclTuningModeNone)
+    if (options == nullptr || options->mode == AclTuningModeNone)
     {
         return nullptr;
     }
@@ -68,8 +68,7 @@ std::unique_ptr<CLTuner> populate_tuner(const AclQueueOptions *options)
 }
 } // namespace
 
-ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options)
-    : IQueue(ctx), _tuner(nullptr)
+ClQueue::ClQueue(IContext *ctx, const AclQueueOptions *options) : IQueue(ctx), _tuner(nullptr)
 {
     _tuner = populate_tuner(options);
 }
diff --git a/src/gpu/cl/ClQueue.h b/src/gpu/cl/ClQueue.h
index b16a0f4e83..09ffb06cf3 100644
--- a/src/gpu/cl/ClQueue.h
+++ b/src/gpu/cl/ClQueue.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLQUEUE_H
 #define SRC_GPU_CLQUEUE_H
 
-#include "src/common/IQueue.h"
-
 #include "arm_compute/runtime/CL/CLScheduler.h"
 
+#include "src/common/IQueue.h"
+
 #include <memory>
 
 namespace arm_compute
diff --git a/src/gpu/cl/ClTensor.cpp b/src/gpu/cl/ClTensor.cpp
index 0df07813e3..27422a4130 100644
--- a/src/gpu/cl/ClTensor.cpp
+++ b/src/gpu/cl/ClTensor.cpp
@@ -31,8 +31,7 @@ namespace gpu
 {
 namespace opencl
 {
-ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc)
-    : ITensorV2(ctx), _legacy_tensor()
+ClTensor::ClTensor(IContext *ctx, const AclTensorDescriptor &desc) : ITensorV2(ctx), _legacy_tensor()
 {
     ARM_COMPUTE_ASSERT((ctx != nullptr) && (ctx->type() == Target::GpuOcl));
     _legacy_tensor = std::make_unique<CLTensor>();
@@ -43,7 +42,7 @@ void *ClTensor::map()
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:map]: Backing tensor does not exist!");
         return nullptr;
@@ -57,7 +56,7 @@ StatusCode ClTensor::unmap()
 {
     ARM_COMPUTE_ASSERT(_legacy_tensor.get() != nullptr);
 
-    if(_legacy_tensor == nullptr)
+    if (_legacy_tensor == nullptr)
     {
         ARM_COMPUTE_LOG_ERROR_ACL("[ClTensor:unmap]: Backing tensor does not exist!");
         return StatusCode::RuntimeError;
diff --git a/src/gpu/cl/ClTensor.h b/src/gpu/cl/ClTensor.h
index 99d228c0b8..70184cd4bd 100644
--- a/src/gpu/cl/ClTensor.h
+++ b/src/gpu/cl/ClTensor.h
@@ -24,10 +24,10 @@
 #ifndef SRC_GPU_CLTENSOR_H
 #define SRC_GPU_CLTENSOR_H
 
-#include "src/common/ITensorV2.h"
-
 #include "arm_compute/runtime/CL/CLTensor.h"
 
+#include "src/common/ITensorV2.h"
+
 namespace arm_compute
 {
 namespace gpu
@@ -54,7 +54,7 @@ public:
     void                  *map() override;
     StatusCode             unmap() override;
     arm_compute::ITensor  *tensor() const override;
-    StatusCode import(void *handle, ImportMemoryType type) override;
+    StatusCode             import(void *handle, ImportMemoryType type) override;
 
 private:
     std::unique_ptr<CLTensor> _legacy_tensor;
@@ -63,4 +63,4 @@ private:
 } // namespace gpu
 } // namespace arm_compute
 
-#endif /* SRC_GPU_CLTENSOR_H */
\ No newline at end of file
+#endif /* SRC_GPU_CLTENSOR_H */
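ClTensor::map()/unmap() above bracket every host access to the backing CLTensor. A standalone sketch of that map/write/unmap discipline; MappedBuffer is an illustrative stand-in, not the library's type:

    #include <cstring>
    #include <iostream>
    #include <vector>

    class MappedBuffer
    {
    public:
        explicit MappedBuffer(size_t bytes) : _storage(bytes) {}
        void *map()
        {
            _mapped = true;
            return _storage.data();
        }
        bool unmap()
        {
            const bool ok = _mapped; // mirrors reporting an error when no mapping exists
            _mapped       = false;
            return ok;
        }

    private:
        std::vector<unsigned char> _storage;
        bool                       _mapped{false};
    };

    int main()
    {
        MappedBuffer buf(4);
        const unsigned char data[4] = {1, 2, 3, 4};
        void *ptr = buf.map(); // host access is only valid between map() and unmap()
        std::memcpy(ptr, data, sizeof(data));
        std::cout << (buf.unmap() ? "unmapped" : "error") << '\n';
    }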
diff --git a/src/gpu/cl/IClKernel.h b/src/gpu/cl/IClKernel.h
index 52ea3c9183..4f07e9ad68 100644
--- a/src/gpu/cl/IClKernel.h
+++ b/src/gpu/cl/IClKernel.h
@@ -25,6 +25,7 @@
 #define ARM_COMPUTE_ICL_KERNEL_H
 
 #include "arm_compute/core/ITensorInfo.h"
+
 #include "src/core/CL/ICLKernel.h"
 
 namespace arm_compute
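The activation-kernel validator below pins the output quantization for fused activations, e.g. QuantizationInfo(1.f / 128.f, 128) for TANH on QASYMM8. The reason is arithmetic: with scale 1/128 and offset 128, the uint8 range [0, 255] dequantizes to [-1, ~0.992], exactly covering tanh's output range. A standalone check of that dequantization:

    #include <iostream>

    int main()
    {
        const float scale  = 1.f / 128.f; // constants taken from the validator below
        const int   offset = 128;
        for (int q : {0, 128, 255})
        {
            std::cout << "q=" << q << " -> " << (q - offset) * scale << '\n';
        }
        // q=0 -> -1, q=128 -> 0, q=255 -> 0.992188
    }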
dst->quantization_info() : src->quantization_info(); - const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && (quantized_supported_activations.count(f_act) == 0), - "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and lower/upper bounded relu are supported"); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 128))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 32768.f, 0))); - - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::TANH) && (oq_info != QuantizationInfo(1.f / 128.f, 0))); - ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && (oq_info != QuantizationInfo(1.f / 256.f, -128))); + const DataType data_type = src->data_type(); + const QuantizationInfo &oq_info = (dst != nullptr) ? dst->quantization_info() : src->quantization_info(); + const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_data_type_quantized(data_type) && + (quantized_supported_activations.count(f_act) == 0), + "For Quantized data type only hard swish, leaky relu, tanh, logistic, relu and " + "lower/upper bounded relu are supported"); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 128))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8 && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized_symmetric(data_type) && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 32768.f, 0))); + + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::TANH) && + (oq_info != QuantizationInfo(1.f / 128.f, 0))); + ARM_COMPUTE_RETURN_ERROR_ON(data_type == DataType::QASYMM8_SIGNED && + (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) && + (oq_info != QuantizationInfo(1.f / 256.f, -128))); // Checks performed when destination is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); @@ -95,15 +106,18 @@ ClActivationKernel::ClActivationKernel() _type = CLKernelType::ELEMENTWISE; } -void ClActivationKernel::configure(const ClCompileContext &compile_context, 
ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info) +void ClActivationKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _run_in_place = (dst == nullptr) || (dst == src); - if(dst != nullptr) + if (dst != nullptr) { // Destination auto initialization if not yet initialized auto_init_if_empty(*dst, *src->clone()); @@ -119,11 +133,10 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen const ActivationLayerInfo::ActivationFunction f_act = act_info.activation(); const bool is_quantized = is_data_type_quantized(dt); - const bool perform_activation_in_float = - (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) - || (f_act == ActivationLayerInfo::ActivationFunction::TANH) - || (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) - || (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); + const bool perform_activation_in_float = (f_act == ActivationLayerInfo::ActivationFunction::LOGISTIC) || + (f_act == ActivationLayerInfo::ActivationFunction::TANH) || + (f_act == ActivationLayerInfo::ActivationFunction::HARD_SWISH) || + (f_act == ActivationLayerInfo::ActivationFunction::LEAKY_RELU); // Set build options CLBuildOptions build_opts; @@ -132,22 +145,23 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen build_opts.add_option("-DACT=" + lower_string(string_from_activation_func(f_act))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dt)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); std::string kernel_name = std::string("activation_layer"); // Set quantization info build options - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); - if(!perform_activation_in_float) + if (!perform_activation_in_float) { int a_const_int = 0; int b_const_int = 0; // Create quantized version of constants a, b if needed - switch(dt) + switch (dt) { case DataType::QASYMM8: { @@ -180,22 +194,25 @@ void ClActivationKernel::configure(const ClCompileContext &compile_context, ITen } // Quantized value of 0 corresponds to the offset o1 - build_opts.add_option(("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); + build_opts.add_option( + ("-DCONST_0=" + (is_data_type_quantized_asymmetric(dt) ? support::cpp11::to_string(iq_info.offset) : "0"))); build_opts.add_option(("-DS1_VAL=" + float_to_string_with_full_precision(iq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO1_VAL=" + support::cpp11::to_string(iq_info.offset)); // Set correct kernel name kernel_name += perform_activation_in_float ? 
std::string("_quant_f32") : std::string("_quant"); // Set scale and offset of the source and destination if they have different quantization info - if(dst != nullptr) + if (dst != nullptr) { const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); - if(iq_info != oq_info) + if (iq_info != oq_info) { build_opts.add_option(("-DS2_VAL=" + float_to_string_with_full_precision(oq_info.scale))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DO2_VAL=" + support::cpp11::to_string(oq_info.offset)); } } } @@ -235,8 +252,9 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON(_run_in_place && src != dst); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -246,13 +264,12 @@ void ClActivationKernel::run_op(ITensorPack &tensors, const Window &window, ::cl { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); - if(!_run_in_place) + if (!_run_in_place) { add_3D_tensor_argument(idx, dst, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClActivationKernel.h b/src/gpu/cl/kernels/ClActivationKernel.h index 82e35b6104..ab7607bb82 100644 --- a/src/gpu/cl/kernels/ClActivationKernel.h +++ b/src/gpu/cl/kernels/ClActivationKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -51,7 +52,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] act_info Activation layer information. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, ActivationLayerInfo act_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + ActivationLayerInfo act_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivationKernel::configure() @@ -64,7 +68,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - bool _run_in_place{ false }; + bool _run_in_place{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp index 3d8ecf1fcc..a853f6bc1b 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -66,12 +66,15 @@ ClBatchConcatenateKernel::ClBatchConcatenateKernel() _type = CLKernelType::ELEMENTWISE; } -void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst) +void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int batch_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, batch_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _batch_offset = batch_offset; @@ -81,8 +84,9 @@ void ClBatchConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -136,8 +140,9 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -152,9 
+157,8 @@ void ClBatchConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h index f6b7c0ed09..549576b628 100644 --- a/src/gpu/cl/kernels/ClBatchConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClBatchConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int batch_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClBatchConcatenateKernel::configure() @@ -66,7 +67,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - unsigned int _batch_offset{ 0 }; + unsigned int _batch_offset{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.cpp b/src/gpu/cl/kernels/ClCastKernel.cpp index f621ad62d7..9ca35634f4 100644 --- a/src/gpu/cl/kernels/ClCastKernel.cpp +++ b/src/gpu/cl/kernels/ClCastKernel.cpp @@ -32,10 +32,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -52,20 +52,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Conver ARM_COMPUTE_UNUSED(policy); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src == dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32, DataType::S64, DataType::U64); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::S8, DataType::QASYMM8, DataType::S16, - DataType::U16, DataType::U32, DataType::S32, DataType::F16, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32, DataType::S64, DataType::U64); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::S8, DataType::QASYMM8, + DataType::S16, DataType::U16, DataType::U32, DataType::S32, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_type() == dst->data_type(), "src and dst data types must be different"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -79,7 +76,10 @@ ClCastKernel::ClCastKernel() _type = CLKernelType::ELEMENTWISE; } 
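The ClCastKernel::configure() hunk just below keeps the comment that conversions from a float source always enable -DSATURATE, because out-of-range float-to-integer conversion is implementation defined. A minimal host-side sketch of the same saturating behaviour, assuming plain C++17 (the helper name is illustrative, not part of the library):

#include <algorithm>
#include <cmath>
#include <cstdint>

// Hypothetical helper: clamp into the destination range first, so the final
// conversion is always in range and therefore well defined. (NaN handling is
// left aside in this sketch.)
static int8_t saturate_f32_to_s8(float v)
{
    const float lo = static_cast<float>(INT8_MIN); // -128
    const float hi = static_cast<float>(INT8_MAX); //  127
    return static_cast<int8_t>(std::lround(std::clamp(v, lo, hi)));
}
// e.g. saturate_f32_to_s8(300.0f) yields 127, whereas a raw
// static_cast<int8_t>(300.0f) has no defined result.
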
-void ClCastKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCastKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -88,7 +88,7 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, policy)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Get data sizes const size_t src_size = data_size_from_type(src->data_type()); @@ -100,12 +100,14 @@ void ClCastKernel::configure(const CLCompileContext &compile_context, const ITen // Set build options CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); // Conversions from float always SATURATE as out-of-bounds conversion from float->integer is implementation defined build_opts.add_option_if(is_data_type_float(src->data_type()) || policy == ConvertPolicy::SATURATE, "-DSATURATE"); - build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), "-DIS_DATA_TYPE_FLOAT"); + build_opts.add_option_if(is_data_type_float(src->data_type()) || is_data_type_float(dst->data_type()), + "-DIS_DATA_TYPE_FLOAT"); build_opts.add_option_if(is_data_type_quantized(src->data_type()), "-DIS_DATA_TYPE_QUANTIZED"); // Create kernel @@ -148,8 +150,9 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -162,8 +165,7 @@ void ClCastKernel::run_op(ITensorPack &tensors, const Window &window, ::cl::Comm add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCastKernel.h b/src/gpu/cl/kernels/ClCastKernel.h index a021b3c78c..07b0b61443 100644 --- a/src/gpu/cl/kernels/ClCastKernel.h +++ b/src/gpu/cl/kernels/ClCastKernel.h @@ -64,7 +64,8 @@ public: * @param[out] dst The destination tensor. Data types supported: U8/S8/QASYMM8/U16/S16/U32/S32/F16/F32. 
* @param[in] policy Conversion policy */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCastKernel::configure() diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.cpp b/src/gpu/cl/kernels/ClCol2ImKernel.cpp index 3316742912..9972e07f05 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.cpp +++ b/src/gpu/cl/kernels/ClCol2ImKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -47,29 +48,38 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); // Checks performed when output is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), compute_col2im_shape(*src, convolved_dims, true, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, "Col2Im output's data layout must always be NCHW"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_layout() != DataLayout::NCHW, + "Col2Im output's data layout must always be NCHW"); } return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)).set_data_layout(DataLayout::NCHW)); + auto_init_if_empty(*dst, src->clone() + ->set_tensor_shape(compute_col2im_shape(*src, convolved_dims, true, num_groups)) + .set_data_layout(DataLayout::NCHW)); constexpr unsigned int num_elems_read_per_iteration = 8; @@ -80,18 +90,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso AccessWindowHorizontal input_access(src, 0, num_elems_read_per_iteration); bool window_changed = update_window_and_padding(win, input_access); - Status err = (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace -ClCol2ImKernel::ClCol2ImKernel() - : _convolved_dims() +ClCol2ImKernel::ClCol2ImKernel() : _convolved_dims() { _type = CLKernelType::ELEMENTWISE; } -void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +void ClCol2ImKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -132,11 +146,15 @@ void ClCol2ImKernel::configure(const CLCompileContext &compile_context, ITensorI _config_id += support::cpp11::to_string(dst->dimension(1)); } -Status ClCol2ImKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups) +Status ClCol2ImKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, convolved_dims, num_groups)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), convolved_dims, num_groups).first); return Status{}; } @@ -168,8 +186,7 @@ void ClCol2ImKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); + } while (collapsed.slide_window_slice_3D(slice) && collapsed_out.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClCol2ImKernel.h b/src/gpu/cl/kernels/ClCol2ImKernel.h index e19b7c8e16..34194aba01 100644 --- a/src/gpu/cl/kernels/ClCol2ImKernel.h +++ b/src/gpu/cl/kernels/ClCol2ImKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COL2IM_KERNEL_H #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -68,14 +69,19 @@ public: * @param[in] convolved_dims Output convolved dimensions. 
* @param[in] num_groups (Optional) Number of groups when performing a grouped convolution */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &convolved_dims, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClCol2ImKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &convolved_dims, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp index 716dec1f30..85d3c3939c 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -45,17 +46,21 @@ ClConvertFullyConnectedWeightsKernel::ClConvertFullyConnectedWeightsKernel() _type = CLKernelType::ELEMENTWISE; } -void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialisation if not yet initialized auto_init_if_empty(*dst, *src->clone()); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - ARM_COMPUTE_ERROR_THROW_ON(ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout)); const DataLayout src_data_layout = (data_layout == DataLayout::NCHW) ? 
DataLayout::NHWC : DataLayout::NCHW; @@ -85,8 +90,10 @@ void ClConvertFullyConnectedWeightsKernel::configure(const CLCompileContext &com ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, - DataLayout data_layout) +Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -96,7 +103,7 @@ Status ClConvertFullyConnectedWeightsKernel::validate(const ITensorInfo *src, co ARM_COMPUTE_RETURN_ERROR_ON(data_layout == DataLayout::UNKNOWN); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -110,8 +117,9 @@ void ClConvertFullyConnectedWeightsKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h index 16000e82f6..0ddb54561a 100644 --- a/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h +++ b/src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h @@ -55,14 +55,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeightsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClCopyKernel.cpp b/src/gpu/cl/kernels/ClCopyKernel.cpp index 4719448819..c80ef664f5 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.cpp +++ b/src/gpu/cl/kernels/ClCopyKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,11 +51,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, Window ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); // Validate dst if initialized - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); - if(dst_window == nullptr) + if (dst_window == nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(src->tensor_shape(), dst->tensor_shape()); } @@ -74,12 +75,15 @@ ClCopyKernel::ClCopyKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window) +void ClCopyKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, dst_window)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel CLBuildOptions build_opts; @@ -93,7 +97,7 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const Window win_config = calculate_max_window(*src, Steps(vec_size_x)); - if(dst_window != nullptr) + if (dst_window != nullptr) { _has_dst_window = true; _dst_window = Window(*dst_window); @@ -101,9 +105,11 @@ void ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen const int vec_size_x_leftover = width_x % vec_size_x; const bool multi_access_x = width_x >= static_cast<int32_t>(vec_size_x); - if(multi_access_x) + if (multi_access_x) { - _dst_window.set(Window::DimX, Window::Dimension(dst_window->x().start(), ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); + _dst_window.set(Window::DimX, + Window::Dimension(dst_window->x().start(), + ceil_to_multiple(dst_window->x().end(), vec_size_x), vec_size_x)); } build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_x_leftover)); @@ -127,7 +133,8 @@ void 
ClCopyKernel::configure(const CLCompileContext &compile_context, const ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) +Status +ClCopyKernel::validate(const arm_compute::ITensorInfo *src, const arm_compute::ITensorInfo *dst, Window *dst_window) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, dst_window)); @@ -139,12 +146,13 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice; - if(_has_dst_window) + if (_has_dst_window) { slice = window.first_slice_window_3D(); Window out_slice = _dst_window.first_slice_window_3D(); @@ -154,8 +162,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, out_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); + } while (window.slide_window_slice_3D(slice) && _dst_window.slide_window_slice_3D(out_slice)); } else { @@ -167,8 +174,7 @@ void ClCopyKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClCopyKernel.h b/src/gpu/cl/kernels/ClCopyKernel.h index 63fd806586..f915bf672d 100644 --- a/src/gpu/cl/kernels/ClCopyKernel.h +++ b/src/gpu/cl/kernels/ClCopyKernel.h @@ -47,7 +47,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopyKernel::configure() diff --git a/src/gpu/cl/kernels/ClCropKernel.cpp b/src/gpu/cl/kernels/ClCropKernel.cpp index 87ad6b49d9..0c503e13fc 100644 --- a/src/gpu/cl/kernels/ClCropKernel.cpp +++ b/src/gpu/cl/kernels/ClCropKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CPP/Validate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,8 +47,14 @@ ClCropKernel::ClCropKernel() _type = CLKernelType::ELEMENTWISE; } -void ClCropKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, - float extrapolation_value, Window *dst_window) +void ClCropKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, start, end, batch_index, extrapolation_value, dst_window)); @@ -60,7 +67,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen // Create and update the window (if needed) Window win = calculate_max_window(*dst); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(win, *dst_window); win = *dst_window; @@ -70,7 +77,7 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen const bool multi_access_x = dst_width_x >= vec_size_x; const bool remainder_x = dst_width_x % vec_size_x > 0; - if(multi_access_x) + if (multi_access_x) { win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); @@ -81,13 +88,21 @@ void ClCropKernel::configure(const CLCompileContext &compile_context, const ITen CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option_if(multi_access_x && remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0))); build_opts.add_option_if(start.x > end.x, "-DWIDTH_FLIPPED="); build_opts.add_option_if(start.y > end.y, "-DHEIGHT_FLIPPED="); _kernel = create_kernel(compile_context, "crop_tensor", build_opts.options()); } -Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCropKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_UNUSED(extrapolation_value, dst_window); 
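The crop, copy, cast and dequantize hunks in this patch all derive their vectorisation build options from the same two quantities seen above; a short sketch, assuming plain C++ (the variable names are illustrative, not the library's):

#include <algorithm>
#include <cstdio>

int main()
{
    const int width    = 19; // elements along X, e.g. dst_width_x above
    const int vec_size = 4;  // elements consumed per vector access

    // VEC_SIZE_LEFTOVER: elements that do not fill a whole vector.
    const int leftover = width % vec_size; // 19 % 4 == 3

    // LAST_ACCESSED_X: start of the last full vector access, clamped at zero,
    // so the tail access is shifted back instead of reading past the end.
    const int last_accessed_x = std::max(width - vec_size, 0); // 15

    std::printf("VEC_SIZE_LEFTOVER=%d LAST_ACCESSED_X=%d\n", leftover, last_accessed_x);
    return 0;
}
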
ARM_COMPUTE_RETURN_ERROR_ON_CPU_F16_UNSUPPORTED(src); @@ -95,14 +110,15 @@ Status ClCropKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, Co ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON(src->tensor_shape().num_dimensions() > 4); ARM_COMPUTE_RETURN_ERROR_ON(start.x < 0 || start.y < 0 || end.x < 0 || end.y < 0); - ARM_COMPUTE_RETURN_ERROR_ON(start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) - || end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON( + start.x >= static_cast<int32_t>(src->dimension(1)) || start.y >= static_cast<int32_t>(src->dimension(2)) || + end.x >= static_cast<int32_t>(src->dimension(1)) || end.y >= static_cast<int32_t>(src->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(batch_index >= src->dimension(3)); - if(dst_window != nullptr) + if (dst_window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(dst_window->x().step() != 1); } - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(dst, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -116,12 +132,15 @@ void ClCropKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window in_slice = Window(); in_slice.use_tensor_dimensions(src->info()->tensor_shape()); - in_slice.set(Window::DimX, Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), window.x().step())); + in_slice.set(Window::DimX, + Window::Dimension(in_slice.x().start(), ceil_to_multiple(in_slice.x().end(), window.x().step()), + window.x().step())); in_slice.set(3, Window::Dimension(_batch_index, _batch_index + 1, 1)); unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClCropKernel.h b/src/gpu/cl/kernels/ClCropKernel.h index 2f166e184c..506262608c 100644 --- a/src/gpu/cl/kernels/ClCropKernel.h +++ b/src/gpu/cl/kernels/ClCropKernel.h @@ -53,16 +53,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCropKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp index a05cd1321e..ec44d88f01 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -48,7 +48,8 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimX) != dst->dimension(Window::DimX)); @@ -60,18 +61,20 @@ Status validate_arguments(const ITensorInfo *src, unsigned int depth_offset, con } } // namespace -ClDepthConcatenateKernel::ClDepthConcatenateKernel() - : _depth_offset(0) +ClDepthConcatenateKernel::ClDepthConcatenateKernel() : _depth_offset(0) { _type = CLKernelType::ELEMENTWISE; } -void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst) +void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int depth_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, depth_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _depth_offset = depth_offset; @@ -81,8 +84,9 @@ void ClDepthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - 
build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -122,8 +126,9 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_3D(); @@ -138,8 +143,7 @@ void ClDepthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h index 4739677f3b..539f010303 100644 --- a/src/gpu/cl/kernels/ClDepthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClDepthConcatenateKernel.h @@ -53,7 +53,8 @@ public: * @note: The gaps between the two lowest dimensions of src and dst need to be divisible by 2. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int depth_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClDepthConcatenateKernel::configure() diff --git a/src/gpu/cl/kernels/ClDequantizeKernel.cpp b/src/gpu/cl/kernels/ClDequantizeKernel.cpp index 756cd56a8b..53429ab1aa 100644 --- a/src/gpu/cl/kernels/ClDequantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClDequantizeKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -49,9 +48,11 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8_PER_CHANNEL, DataType::QSYMM8, + DataType::QSYMM16); - if(dst->tensor_shape().total_size() > 0) + if (dst->tensor_shape().total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32); @@ -74,7 +75,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen // Output tensor auto initialization if not yet initialized auto_init_if_empty(*dst, src->tensor_shape(), 1, DataType::F32); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -87,7 +88,7 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen // Create kernel CLBuildOptions build_opts; - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); const int qoffset = is_data_type_quantized_asymmetric(src->data_type()) ? 
qinfo.offset : 0; @@ -103,16 +104,18 @@ void ClDequantizeKernel::configure(const CLCompileContext &compile_context, ITen build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string( + std::max<int>(output_width_x - vec_size_x, 0))); // Create kernel name _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -136,10 +139,11 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: const bool is_quantized_per_channel = is_data_type_quantized_per_channel(src->info()->data_type()); // Collapse window - Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); + Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) + : window.collapse_if_possible(ICLKernel::window(), 3); Window slice = new_window.first_slice_window_3D(); - if(is_quantized_per_channel) + if (is_quantized_per_channel) { unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters _kernel.setArg(idx++, src->quantization().scale->cl_buffer()); @@ -151,8 +155,7 @@ void ClDequantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl:: add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(new_window.slide_window_slice_3D(slice)); + } while (new_window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp index 7ad398412a..7cf1958c1b 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.cpp @@ -23,17 +23,18 @@ */ #include "src/gpu/cl/kernels/ClDirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/ITensor.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/PixelValue.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" @@ -51,11 +52,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - 
const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8_SIGNED, DataType::QASYMM8, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); const DataLayout data_layout = src->data_layout(); @@ -63,41 +70,56 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const int height_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); const int channel_idx = get_data_layout_dimension_index(data_layout, DataLayoutDimension::CHANNEL); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, "Export to CLImage is not supported for the input tensor"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, "Export to CLImage is not supported for the output tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_input_to_cl_image == true, + "Export to CLImage is not supported for the input tensor"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.export_output_to_cl_image == true, + "Export to CLImage is not supported for the output tensor"); - if(data_layout == DataLayout::NCHW) + if (data_layout == DataLayout::NCHW) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), "Weights should have same width and height"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, "Strides larger than 3 not supported for 1x1 convolution."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || weights->dimension(width_idx) == 9) && std::get<0>(conv_info.stride()) > 2, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != weights->dimension(height_idx), + "Weights should have same width and height"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 1) && std::get<0>(conv_info.stride()) > 3, + "Strides larger than 3 not supported for 1x1 convolution."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((weights->dimension(width_idx) == 3 || weights->dimension(width_idx) == 5 || + weights->dimension(width_idx) == 9) && + std::get<0>(conv_info.stride()) > 2, "Strides larger than 2 not supported for 3x3, 5x5, 9x9 convolution."); ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled(), "Fused activation is not supported for NCHW layout"); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && 
weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, - "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5 && weights->dimension(width_idx) != 9, + "Kernel sizes other than 1x1, 3x3, 5x5 or 9x9 are not supported with quantized data types"); } else { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && weights->dimension(width_idx) != 5, - "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + weights->dimension(width_idx) != 1 && weights->dimension(width_idx) != 3 && + weights->dimension(width_idx) != 5, + "Kernel sizes other than 1x1, 3x3 or 5x5 are not supported with float data types"); } } - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), "Fused activation in NHWC is only supported for floating point."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(act_info.enabled() && !is_data_type_float(src->data_type()), + "Fused activation in NHWC is only supported for floating point."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); @@ -106,9 +128,9 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -118,20 +140,19 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(0) != weights->dimension(3), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } const auto data_type = src->data_type(); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -140,7 +161,8 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co float multiplier = iqinfo.scale * wqinfo.scale / oqinfo.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); } return Status{}; } @@ -151,8 +173,14 @@ ClDirectConv2dKernel::ClDirectConv2dKernel() { _type = CLKernelType::DIRECT; } -void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -178,14 +206,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { output_shape.collapse(2U, 1U); const unsigned int n0 = adjust_vec_size(desc.n0, output_shape[0]); @@ -194,7 +219,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT // Create window and update padding win = calculate_max_window(output_shape, Steps(n0, m0)); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { _num_elems_processed_per_iteration = 1u; win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -205,7 +230,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT std::stringstream kernel_name; CLBuildOptions build_options; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { kernel_name << "direct_convolution_nhwc"; @@ -221,22 +246,22 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _export_output_to_cl_image = desc.export_output_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { gemm::update_padding_for_cl_image(weights); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { gemm::update_padding_for_cl_image(dst); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) {
gemm::update_padding_for_cl_image(src); } - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -246,9 +271,10 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT const auto act_function = act_info.activation(); const auto dst_data_type = dst->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (dst_data_type == DataType::F32 || dst_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -259,7 +285,8 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-cl-fast-relaxed-math"); } - build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", "-DSRC_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_input_to_cl_image, "-DSRC_TENSOR_TYPE=IMAGE", + "-DSRC_TENSOR_TYPE=BUFFER"); build_options.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_options.add_option("-DSRC_CHANNELS=" + support::cpp11::to_string(src->dimension(0))); build_options.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(1))); @@ -267,9 +294,11 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(0))); build_options.add_option("-DDST_WIDTH=" + support::cpp11::to_string(dst->dimension(1))); build_options.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(2))); - build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", "-DDST_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_output_to_cl_image, "-DDST_TENSOR_TYPE=IMAGE", + "-DDST_TENSOR_TYPE=BUFFER"); build_options.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst_data_type)); - build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", "-DWEI_TENSOR_TYPE=BUFFER"); + build_options.add_option_if_else(_export_weights_to_cl_image, "-DWEI_TENSOR_TYPE=IMAGE", + "-DWEI_TENSOR_TYPE=BUFFER"); build_options.add_option("-DWEI_WIDTH=" + support::cpp11::to_string(weights->dimension(width_idx))); build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option("-DWEI_DATA_TYPE=" + get_cl_type_from_data_type(weights->data_type())); @@ -284,7 +313,7 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option_if((src->dimension(channel_idx) % k0) != 0, "-DLEFTOVER_LOOP"); build_options.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_function))); - if(is_data_type_quantized(data_type)) + if 
(is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -314,11 +343,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DSRC_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DWEI_OFFSET=" + support::cpp11::to_string(0)); build_options.add_option("-DDST_OFFSET=" + support::cpp11::to_string(0)); - build_options.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); - build_options.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); + build_options.add_option_if(act_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); + build_options.add_option_if(act_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -340,13 +371,17 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT build_options.add_option("-DWEI_HEIGHT=" + support::cpp11::to_string(weights->dimension(height_idx))); build_options.add_option(std::string("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type))); build_options.add_option(std::string("-DDATA_SIZE=" + get_data_size_from_data_type(data_type))); - build_options.add_option(std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); + build_options.add_option( + std::string("-DWEIGHTS_DEPTH=" + support::cpp11::to_string(weights->dimension(channel_idx)))); build_options.add_option(std::string("-DSTRIDE_X=" + support::cpp11::to_string(conv_stride_x))); build_options.add_option(std::string("-DDATA_TYPE_PROMOTED=" + get_cl_type_from_data_type(data_type))); - build_options.add_option(std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); - build_options.add_option(std::string("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE=" + support::cpp11::to_string(_num_elems_processed_per_iteration))); + build_options.add_option( + std::string("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration))); - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -405,8 +440,13 @@ void ClDirectConv2dKernel::configure(const CLCompileContext &compile_context, IT _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClDirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClDirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, 
desc)); return Status{}; @@ -420,52 +460,55 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { cl::Image2D weights_cl_image; cl::Image2D output_cl_image; cl::Image2D input_cl_image; - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { // Export tensor to cl_image weights_cl_image = create_image2d_from_tensor(weights, CLImage2DType::ReadOnly); } - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { // Export tensor to cl_image output_cl_image = create_image2d_from_tensor(dst, CLImage2DType::WriteOnly); } - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { // Export tensor to cl_image input_cl_image = create_image2d_from_tensor(src, CLImage2DType::ReadOnly); } unsigned int idx = 0; - if(_export_input_to_cl_image) + if (_export_input_to_cl_image) { _kernel.setArg(idx++, input_cl_image); } add_4d_tensor_nhwc_argument(idx, src); - if(_export_output_to_cl_image) + if (_export_output_to_cl_image) { _kernel.setArg(idx++, output_cl_image); } add_4d_tensor_nhwc_argument(idx, dst); - if(_export_weights_to_cl_image) + if (_export_weights_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } @@ -476,7 +519,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl unsigned int idx1 = 2 * num_arguments_per_3D_tensor(); add_3D_tensor_argument(idx1, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { Window slice_biases; slice_biases.use_tensor_dimensions(biases->info()->tensor_shape()); @@ -491,8 +534,7 @@ void ClDirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClDirectConv2dKernel.h b/src/gpu/cl/kernels/ClDirectConv2dKernel.h index 7132762b35..c934c825ca 100644 --- a/src/gpu/cl/kernels/ClDirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" 
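// A minimal sketch, not the library's exact implementation, of what
// quantization::calculate_quantized_multiplier (called in the quantized
// direct-conv paths above) conceptually computes: the real multiplier
// iqinfo.scale * wqinfo.scale / oqinfo.scale is split into a Q0.31 integer
// multiplier plus a shift, so requantization stays in integer arithmetic.
// Names here are illustrative; the (0, 1) multiplier range is an assumption.
#include <cmath>
#include <cstdint>

bool decompose_multiplier_sketch(double multiplier, int32_t *quant_multiplier, int *shift)
{
    if (multiplier <= 0.0 || multiplier >= 1.0) // sketch assumes a multiplier in (0, 1)
    {
        return false;
    }
    const double q = std::frexp(multiplier, shift); // multiplier = q * 2^shift, q in [0.5, 1)
    int64_t q_fixed = static_cast<int64_t>(std::llround(q * (1ll << 31)));
    if (q_fixed == (1ll << 31)) // rounding carried into 2^31: renormalize
    {
        q_fixed /= 2;
        ++(*shift);
    }
    *quant_multiplier = static_cast<int32_t>(q_fixed);
    // Usage: requantized ~= (int64_t(acc) * *quant_multiplier) >> (31 - *shift)
    return true;
}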
#include "src/gpu/cl/IClKernel.h" @@ -68,16 +69,27 @@ public: * @param[in] act_info Contains activaton information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct convolution kernel. For NCHW, this parameter is ignored. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -85,9 +97,9 @@ public: public: DataLayout _data_layout{}; PadStrideInfo _conv_info{}; - bool _export_weights_to_cl_image{ false }; - bool _export_output_to_cl_image{ false }; - bool _export_input_to_cl_image{ false }; + bool _export_weights_to_cl_image{false}; + bool _export_output_to_cl_image{false}; + bool _export_input_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp index 6191178911..8002520a87 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -40,7 +41,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_LAYOUT(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); @@ -49,20 +54,25 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv3d_info.act_info.enabled(), "Fused activation not supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32, DataType::QASYMM8, + 
DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); ARM_COMPUTE_RETURN_ERROR_ON(conv3d_info.dilation != Size3D(1U, 1U, 1U)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->dimension(1) != src0->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 5, "Weights can be at most 5 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(2) > + (src0->dimension(1) + conv3d_info.padding.left + conv3d_info.padding.right)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(3) > + (src0->dimension(2) + conv3d_info.padding.top + conv3d_info.padding.bottom)); + ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(4) > + (src0->dimension(3) + conv3d_info.padding.front + conv3d_info.padding.back)); - if(src2 != nullptr) + if (src2 != nullptr) { - if(is_data_type_quantized(src0->data_type())) + if (is_data_type_quantized(src0->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::S32); } @@ -70,15 +80,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), "Biases size and number of dst feature maps should match"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->dimension(0) != src1->dimension(0), + "Biases size and number of dst feature maps should match"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(src2->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src1->dimension(0), "Weights and dst OFMs should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_conv3d_shape(src0->tensor_shape(), src1->tensor_shape(), conv3d_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -91,8 +104,12 @@ ClDirectConv3dKernel::ClDirectConv3dKernel() _type = CLKernelType::DIRECT; } -void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, - const Conv3dInfo &conv3d_info) +void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -149,13 +166,13 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co build_options.add_option("-DK0=" + support::cpp11::to_string(k0)); build_options.add_option("-DPARTIAL_N0=" + 
support::cpp11::to_string(partial_store_n0)); - if(src2 != nullptr) + if (src2 != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(src2->data_type()))); } - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { const UniformQuantizationInfo iqinfo = src0->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = src1->quantization_info().uniform(); @@ -218,7 +235,11 @@ void ClDirectConv3dKernel::configure(const CLCompileContext &compile_context, co _config_id += support::cpp11::to_string(dst_channels); } -Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3dKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, conv3d_info)); return Status{}; @@ -229,21 +250,28 @@ void ClDirectConv3dKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Get initial windows Window slice = window.first_slice_window_3D(); - slice.set(Window::DimY, Window::Dimension(0, ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * dst->info()->dimension(3), slice.y().step()), slice.y().step())); + slice.set(Window::DimY, Window::Dimension(0, + ceil_to_multiple(dst->info()->dimension(1) * dst->info()->dimension(2) * + dst->info()->dimension(3), + slice.y().step()), + slice.y().step())); slice.set(Window::DimZ, Window::Dimension(0, dst->info()->dimension(4), 1)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice); add_4D_tensor_argument(idx, weights, slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClDirectConv3dKernel.h b/src/gpu/cl/kernels/ClDirectConv3dKernel.h index de4f0ce216..cb7509d8fa 100644 --- a/src/gpu/cl/kernels/ClDirectConv3dKernel.h +++ b/src/gpu/cl/kernels/ClDirectConv3dKernel.h @@ -70,14 +70,23 @@ public: * @param[out] dst Destination tensor. 4 lower dimensions represent a single dst [OFM, width, height, depth], while the rest represent batch of dsts. 
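// ClDirectConv3dKernel::run_op above sizes the collapsed DimY extent with
// ceil_to_multiple so the last partial slice of W * H * D elements is still
// enqueued. A sketch of that rounding (the library ships its own helper;
// this function name is illustrative and only shows the arithmetic):
int ceil_to_multiple_sketch(int value, int step) // assumes step > 0
{
    return ((value + step - 1) / step) * step;
}
// e.g. a 5 x 7 x 3 output volume (105 elements) with step 8 rounds up to 112,
// i.e. 14 full steps of 8.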
* @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClDirectConv3dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.cpp b/src/gpu/cl/kernels/ClElementwiseKernel.cpp index 6beee576b5..cdb3527a92 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseKernel.cpp @@ -23,18 +23,20 @@ */ #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/common/utils/Validate.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" #include "support/StringSupport.h" + #include <map> namespace arm_compute @@ -47,25 +49,20 @@ namespace { constexpr unsigned int vector_size_byte_opencl = 16; -std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = -{ - { ArithmeticOperation::ADD, "ADD" }, - { ArithmeticOperation::SUB, "SUB" }, - { ArithmeticOperation::DIV, "DIV" }, - { ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF" }, - { ArithmeticOperation::MIN, "MIN" }, - { ArithmeticOperation::MAX, "MAX" }, - { ArithmeticOperation::POWER, "POWER" }, - { ArithmeticOperation::PRELU, "PRELU" }, +std::map<ArithmeticOperation, std::string> supported_arithmetic_ops = { + {ArithmeticOperation::ADD, "ADD"}, {ArithmeticOperation::SUB, "SUB"}, + {ArithmeticOperation::DIV, "DIV"}, {ArithmeticOperation::SQUARED_DIFF, "SQUARED_DIFF"}, + {ArithmeticOperation::MIN, "MIN"}, {ArithmeticOperation::MAX, "MAX"}, + {ArithmeticOperation::POWER, "POWER"}, {ArithmeticOperation::PRELU, "PRELU"}, }; -std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = -{ - { ArithmeticOperation::ADD, "ADD" }, - { ArithmeticOperation::SUB, "SUB" }, +std::map<ArithmeticOperation, std::string> supported_sat_arithmetic_ops = { + {ArithmeticOperation::ADD, "ADD"}, + {ArithmeticOperation::SUB, "SUB"}, }; -std::string generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string +generate_id_for_tuning_common(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) { std::string config_id; // Set config_id for 
enabling LWS tuning @@ -79,12 +76,18 @@ std::string generate_id_for_tuning_common(const std::string &kernel_name, const return config_id; } -Status validate_in_place_output_shape(const bool in_place, const bool src1_in_place, const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const TensorShape &out_shape) +Status validate_in_place_output_shape(const bool in_place, + const bool src1_in_place, + const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const TensorShape &out_shape) { - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, src1_in_place ? src1.tensor_shape() : src2.tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -94,7 +97,9 @@ Status validate_in_place_output_shape(const bool in_place, const bool src1_in_pl return Status{}; } -Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(&src1, &src2, &dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); @@ -110,11 +115,12 @@ Status validate_arguments_with_float_only_supported_rules(const ITensorInfo &src ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); } return Status{}; @@ -136,25 +142,27 @@ Status validate_arguments_divide_operation(const ITensorInfo *src1, const ITenso ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::F16, DataType::F32, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, *src1, *src2, *dst, out_shape)); } return Status{}; } -Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +Status +validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, 
DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &src2); - if(is_data_type_quantized_symmetric(src1.data_type())) + if (is_data_type_quantized_symmetric(src1.data_type())) { const int32_t in1_offset = src1.quantization_info().uniform().offset; const int32_t in2_offset = src2.quantization_info().uniform().offset; @@ -170,13 +178,15 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst.total_size() > 0) + if (dst.total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src1, &dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), "Wrong shape for dst"); - ARM_COMPUTE_RETURN_ON_ERROR(validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst.tensor_shape(), 0), + "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_in_place_output_shape(in_place, src1_in_place, src1, src2, dst, out_shape)); - if(is_data_type_quantized_symmetric(dst.data_type())) + if (is_data_type_quantized_symmetric(dst.data_type())) { const int32_t offset = dst.quantization_info().uniform().offset; ARM_COMPUTE_RETURN_ERROR_ON_MSG(offset != 0, "For quantized symmetric, offset must be zero"); @@ -185,19 +195,26 @@ Status validate_arguments_with_arithmetic_rules(const ITensorInfo &src1, const I return Status{}; } -CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst, const std::string &operation_string) +CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &src1, + const ITensorInfo &src2, + const ITensorInfo &dst, + const std::string &operation_string) { CLBuildOptions build_opts; - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src1.data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_IN2=" + support::cpp11::to_string(src2.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN1=" + + support::cpp11::to_string(src1.dimension(0) == 1 ? 1 : num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_IN2=" + + support::cpp11::to_string(src2.dimension(0) == 1 ? 
1 : num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(dst.dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DOP=" + operation_string); - if(is_data_type_quantized(src1.data_type())) + if (is_data_type_quantized(src1.data_type())) { const UniformQuantizationInfo iq1info = src1.quantization_info().uniform(); const UniformQuantizationInfo iq2info = src2.quantization_info().uniform(); @@ -223,14 +240,17 @@ CLBuildOptions generate_build_options_with_arithmetic_rules(const ITensorInfo &s std::pair<Status, Window> configure_window_arithmetic_common(ITensorInfo &dst) { - const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); - Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); + const unsigned int num_elems_processed_per_iteration = + adjust_vec_size(vector_size_byte_opencl / dst.element_size(), dst.dimension(0)); + Window win = calculate_max_window(dst, Steps(num_elems_processed_per_iteration)); return std::make_pair(Status{}, win); } -std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +validate_and_configure_window_for_arithmetic_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -238,9 +258,11 @@ std::pair<Status, Window> validate_and_configure_window_for_arithmetic_operators return configure_window_arithmetic_common(dst); } -std::pair<Status, Window> validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +validate_and_configure_window_for_logical_binary_operators(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; set_shape_if_empty(dst, out_shape); @@ -249,9 +271,11 @@ std::pair<Status, Window> validate_and_configure_window_for_logical_binary_opera return configure_window_arithmetic_common(dst); } -std::pair<Status, Window> validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +validate_and_configure_window_for_division(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - const std::pair<TensorShape, ValidRegion> broadcast_pair = ITensorInfo::broadcast_shape_and_valid_region(src1, src2); + const std::pair<TensorShape, ValidRegion> broadcast_pair = + ITensorInfo::broadcast_shape_and_valid_region(src1, src2); const TensorShape &out_shape = broadcast_pair.first; auto_init_if_empty(dst, out_shape, 1, src1.data_type()); @@ -265,21 
+289,24 @@ ClElementwiseKernel::ClElementwiseKernel() _type = CLKernelType::ELEMENTWISE; } -void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClElementwiseKernel::configure_common(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { // Configure kernel window auto win_config = validate_and_configure_window(*src1, *src2, *dst); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); std::string kernel_name = "elementwise_operation_" + name(); - if(is_data_type_quantized(src1->data_type())) + if (is_data_type_quantized(src1->data_type())) { kernel_name += "_quantized"; } // Set kernel build options CLBuildOptions build_opts = generate_build_options(*src1, *src2, *dst); - if(_act_info.enabled()) + if (_act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(_act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(_act_info.a())); @@ -299,9 +326,11 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -311,17 +340,18 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c bool can_collapse = true; const bool is_vector = in_shape1.num_dimensions() == 1 || in_shape2.num_dimensions() == 1; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1 && !is_vector) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); d++) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -337,7 +367,7 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_src1); add_3D_tensor_argument(idx, src_1, slice_src2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -345,13 +375,16 @@ void ClElementwiseKernel::run_op(ITensorPack &tensors, const Window &window, ::c enqueue(queue, *this, slice, lws_hint()); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_src2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } /** Logical binary */ -void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClLogicalBinaryKernel::validate(op, src1, src2, dst)); @@ -359,7 +392,10 @@ void ClLogicalBinaryKernel::configure(const ClCompileContext &compile_context, L configure_common(compile_context, src1, src2, dst); } -Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status ClLogicalBinaryKernel::validate(LogicalOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst) { ARM_COMPUTE_UNUSED(op); ARM_COMPUTE_ASSERT(op != LogicalOperation::Unknown && op != LogicalOperation::Not); @@ -369,14 +405,16 @@ Status ClLogicalBinaryKernel::validate(LogicalOperation op, const ITensorInfo *s ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_logical_binary_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); return Status{}; } std::string ClLogicalBinaryKernel::name() { - switch(_op) + switch (_op) { case LogicalOperation::And: return "AND"; @@ -390,30 +428,38 @@ std::string ClLogicalBinaryKernel::name() return ""; } -std::pair<Status, Window> ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +ClLogicalBinaryKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { return validate_and_configure_window_for_logical_binary_operators(src1, src2, dst); } -CLBuildOptions ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) +CLBuildOptions +ClLogicalBinaryKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { // The arithmetic utility functions can be shared return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClLogicalBinaryKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const
ITensorInfo &dst) { return generate_id_for_tuning_common(kernel_name, src1, dst); } /** Arithmetic operations with saturation*/ -void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, +void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_ERROR_THROW_ON(ClSaturatedArithmeticKernel::validate(op, input1, input2, output, policy, act_info)); - auto padding_info = get_padding_info({ input1, input2, output }); + auto padding_info = get_padding_info({input1, input2, output}); _policy = policy; _op = op; @@ -422,24 +468,34 @@ void ClSaturatedArithmeticKernel::configure(const ClCompileContext &compile_cont ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, +Status ClSaturatedArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(op, policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input1, input2, output); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*input1, *input2, *output)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*input1->clone(), *input2->clone(), *output->clone()) + .first); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(output->data_type())); return Status{}; } -std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) +std::pair<Status, Window> ClSaturatedArithmeticKernel::validate_and_configure_window(ITensorInfo &input1, + ITensorInfo &input2, + ITensorInfo &output) { return validate_and_configure_window_for_arithmetic_operators(input1, input2, output); } -CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) +CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensorInfo &input1, + const ITensorInfo &input2, + const ITensorInfo &output) { const bool has_float_out = is_data_type_float(output.data_type()); auto build_options = generate_build_options_with_arithmetic_rules(input1, input2, output, name()); @@ -447,7 +503,9 @@ CLBuildOptions ClSaturatedArithmeticKernel::generate_build_options(const ITensor return build_options; } -std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) +std::string ClSaturatedArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) { auto config_id = generate_id_for_tuning_common(kernel_name, input1, output); config_id += (_policy == ConvertPolicy::WRAP) ? 
"_wrap_" : "_saturate_"; @@ -461,12 +519,16 @@ std::string ClSaturatedArithmeticKernel::name() } /** Arithmetic operations*/ -void ClArithmeticKernel::configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, +void ClArithmeticKernel::configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(ClArithmeticKernel::validate(op, src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); _op = op; _act_info = act_info; @@ -474,33 +536,42 @@ void ClArithmeticKernel::configure(const ClCompileContext &compile_context, Arit ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClArithmeticKernel::validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClArithmeticKernel::validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); - if(op == ArithmeticOperation::DIV) + if (op == ArithmeticOperation::DIV) { // Partial integer support S32/F32/F16 ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_divide_operation(src1, src2, dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } - else if(op == ArithmeticOperation::POWER) + else if (op == ArithmeticOperation::POWER) { // Power operators doesn't support integer arithmetic ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_float_only_supported_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_division(*src1->clone(), *src2->clone(), *dst->clone()).first); } else { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_with_arithmetic_rules(*src1, *src2, *dst)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window_for_arithmetic_operators(*src1->clone(), *src2->clone(), *dst->clone()) + .first); } ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); return Status{}; } -std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) +std::pair<Status, Window> +ClArithmeticKernel::validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) { - if(_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) + if (_op == ArithmeticOperation::DIV || _op == ArithmeticOperation::POWER) { // Division and Power operators don't support integer arithmetic return validate_and_configure_window_for_division(src1, src2, dst); @@ -511,11 +582,14 @@ std::pair<Status, Window> ClArithmeticKernel::validate_and_configure_window(ITen } } -CLBuildOptions ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, 
const ITensorInfo &src2, const ITensorInfo &dst) +CLBuildOptions +ClArithmeticKernel::generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) { return generate_build_options_with_arithmetic_rules(src1, src2, dst, name()); } -std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) +std::string ClArithmeticKernel::generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &src1, + const ITensorInfo &dst) { return generate_id_for_tuning_common(kernel_name, src1, dst); } diff --git a/src/gpu/cl/kernels/ClElementwiseKernel.h b/src/gpu/cl/kernels/ClElementwiseKernel.h index ea3ddb2124..73e54542b2 100644 --- a/src/gpu/cl/kernels/ClElementwiseKernel.h +++ b/src/gpu/cl/kernels/ClElementwiseKernel.h @@ -25,8 +25,9 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" -#include "src/core/KernelTypes.h" + #include "src/core/common/Macros.h" +#include "src/core/KernelTypes.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -65,24 +66,28 @@ protected: * * @return a pair of Status and Window */ - virtual std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; + virtual std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) = 0; /** Generate the build options for the specific kernel * * @return a CLBuildOptions struct */ - virtual CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; + virtual CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) = 0; /** Generate the identifier for tuning * * @return a string */ - virtual std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; + virtual std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) = 0; /** Common configure function for element-wise operators with no additional options (e.g., Div, Min, Max, SquaredDiff) * */ - void configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void + configure_common(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); ActivationLayerInfo _act_info{}; }; @@ -100,23 +105,31 @@ public: * @param[in] src2 Second source tensor info. Data types supported: same as @p src1. * @param[in] dst Destination tensor info. Data types supported: same as @p src1.
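// Sketch of how the VEC_SIZE_* build options generated by these kernels are
// derived: a 16-byte vector budget (vector_size_byte_opencl) divided by the
// element size, capped at the innermost extent. The library's adjust_vec_size
// may apply a different policy; the cap-at-dim0 behaviour here is an
// assumption, and the function name is illustrative.
#include <algorithm>
#include <cstddef>

unsigned int pick_vec_size_sketch(std::size_t element_size, std::size_t dim0)
{
    const auto budget = static_cast<unsigned int>(16 / element_size); // 16-byte budget
    return std::min<unsigned int>(budget, static_cast<unsigned int>(dim0));
}
// An F32 dst with dim0 == 70 gets VEC_SIZE_OUT=4 and VEC_SIZE_LEFTOVER=2
// (70 % 4); a broadcast input with dimension(0) == 1 gets VEC_SIZE_IN1=1.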
*/ - void configure(const ClCompileContext &compile_context, LogicalOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst); + void configure(const ClCompileContext &compile_context, + LogicalOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogicalBinaryKernel::configure() * * @return a status */ - static Status validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); + static Status + validate(LogicalOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst); private: // Inherited methods overridden: std::string name() override; - std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; - - LogicalOperation _op{ LogicalOperation::Unknown }; + std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + + LogicalOperation _op{LogicalOperation::Unknown}; }; /** Addition operation */ @@ -135,7 +148,12 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
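// Sketch of what the ConvertPolicy parameter documented above means for the
// saturated arithmetic kernel, shown on an illustrative 8-bit add (the name
// add_u8_sketch is hypothetical, not a library function):
#include <algorithm>
#include <cstdint>

uint8_t add_u8_sketch(uint8_t a, uint8_t b, bool saturate)
{
    const int32_t sum = static_cast<int32_t>(a) + static_cast<int32_t>(b);
    return saturate ? static_cast<uint8_t>(std::min<int32_t>(sum, 255)) // SATURATE clamps at the type's max
                    : static_cast<uint8_t>(sum);                        // WRAP keeps the low 8 bits (mod 256)
}
// add_u8_sketch(200, 100, true) == 255, add_u8_sketch(200, 100, false) == 44.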
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *input1, ITensorInfo *input2, ITensorInfo *output, const ConvertPolicy &policy, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *input1, + ITensorInfo *input2, + ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -144,15 +162,23 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *input1, const ITensorInfo *input2, const ITensorInfo *output, const ConvertPolicy &policy, + static Status validate(ArithmeticOperation op, + const ITensorInfo *input1, + const ITensorInfo *input2, + const ITensorInfo *output, + const ConvertPolicy &policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair<Status, Window> validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; - CLBuildOptions generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &input1, const ITensorInfo &output) override; + std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &input1, ITensorInfo &input2, ITensorInfo &output) override; + CLBuildOptions + generate_build_options(const ITensorInfo &input1, const ITensorInfo &input2, const ITensorInfo &output) override; + std::string generate_id_for_tuning(const std::string &kernel_name, + const ITensorInfo &input1, + const ITensorInfo &output) override; private: ConvertPolicy _policy{}; @@ -174,7 +200,11 @@ public: * @param[in] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ArithmeticOperation op, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, + void configure(const ClCompileContext &compile_context, + ArithmeticOperation op, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration @@ -183,14 +213,21 @@ public: * * @return a status */ - static Status validate(ArithmeticOperation op, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(ArithmeticOperation op, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); protected: // Inherited methods overridden: std::string name() override; - std::pair<Status, Window> validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; - CLBuildOptions generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; - std::string generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; + std::pair<Status, Window> + validate_and_configure_window(ITensorInfo &src1, ITensorInfo &src2, ITensorInfo &dst) override; + CLBuildOptions + generate_build_options(const ITensorInfo &src1, const ITensorInfo &src2, const ITensorInfo &dst) override; + std::string + generate_id_for_tuning(const std::string &kernel_name, const ITensorInfo &src1, const ITensorInfo &dst) override; private: ArithmeticOperation _op{}; diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp index 744a3a40c7..f7c198ee54 100644 --- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp +++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp @@ -23,11 +23,12 @@ */ #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16; Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - if(op == ElementWiseUnary::LOGICAL_NOT) + if (op == ElementWiseUnary::LOGICAL_NOT) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8); } - else if(op == ElementWiseUnary::NEG) + else if (op == ElementWiseUnary::NEG) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32); } - else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. + else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT. 
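[Note] The declarations reflowed above keep the header's validate-then-configure contract: validate() is static and allocation-free, so a caller can reject an invalid operator before building any kernel. A minimal usage sketch, under assumed shapes and types (only ClLogicalBinaryKernel is named in this hunk):

    // Sketch only: U8 inputs of identical shape, checked before any configure() call.
    using namespace arm_compute;
    TensorInfo a(TensorShape(32U, 8U), 1, DataType::U8);
    TensorInfo b(a);
    TensorInfo out(a);
    const Status st = opencl::kernels::ClLogicalBinaryKernel::validate(LogicalOperation::And, &a, &b, &out);
    if (st.error_code() == ErrorCode::OK)
    {
        // configure() may now be called with the same arguments plus a compile context.
    }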
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
index 744a3a40c7..f7c198ee54 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -23,11 +23,12 @@
  */
 #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h"
 
-#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/CL/CLHelpers.h"
 #include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/Utils.h"
 #include "arm_compute/core/utils/helpers/AdjustVecSize.h"
 #include "arm_compute/core/utils/StringUtils.h"
+
 #include "src/core/CL/CLValidate.h"
 #include "src/core/helpers/WindowHelpers.h"
 #include "support/Cast.h"
@@ -46,17 +47,18 @@ constexpr unsigned int vector_size_byte_opencl = 16;
 Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
-    if(op == ElementWiseUnary::LOGICAL_NOT)
+    if (op == ElementWiseUnary::LOGICAL_NOT)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::U8);
     }
-    else if(op == ElementWiseUnary::NEG)
+    else if (op == ElementWiseUnary::NEG)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
     }
-    else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
+    else if (op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
     {
-        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+        ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8,
+                                                             DataType::QASYMM8_SIGNED);
     }
     else
     {
@@ -64,7 +66,7 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const
     }
 
     // Validate in case of configured dst
-    if(dst.total_size() > 0)
+    if (dst.total_size() > 0)
     {
         ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&dst);
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
@@ -80,19 +82,23 @@ ClElementWiseUnaryKernel::ClElementWiseUnaryKernel()
     _type = CLKernelType::ELEMENTWISE;
 }
 
-void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const ElementWiseUnary &op)
+void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context,
+                                         const ITensorInfo *src,
+                                         ITensorInfo *dst,
+                                         const ElementWiseUnary &op)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst);
 
-    auto padding_info = get_padding_info({ src, dst });
+    auto padding_info = get_padding_info({src, dst});
 
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
 
-    const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
+    const unsigned int num_elems_processed_per_iteration =
+        adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
 
-    std::string kernel_name = "elementwise_unary";
-    const int   vec_size_x  = num_elems_processed_per_iteration;
-    const int   dst_width_x = dst->dimension(0);
-    if(is_data_type_quantized(src->data_type()))
+    std::string kernel_name = "elementwise_unary";
+    const int vec_size_x = num_elems_processed_per_iteration;
+    const int dst_width_x = dst->dimension(0);
+    if (is_data_type_quantized(src->data_type()))
     {
         kernel_name += "_quantized";
     }
@@ -101,7 +107,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
     build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
     build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
     build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
-    if(is_data_type_quantized(src->data_type()))
+    if (is_data_type_quantized(src->data_type()))
     {
         const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
         const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
@@ -110,7 +116,7 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
         build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
         build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
     }
-    switch(op)
+    switch (op)
     {
         case ElementWiseUnary::RSQRT:
             build_opts.add_option("-DOPERATION=rsqrt_op");
@@ -169,8 +175,9 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window
     Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ);
     Window slice     = collapsed.first_slice_window_3D();
 
-    const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
-    auto       dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
     do
     {
@@ -178,8 +185,7 @@ void ClElementWiseUnaryKernel::run_op(ITensorPack &tensors, const Window &window
         add_3D_tensor_argument(idx, src, slice);
         add_3D_tensor_argument(idx, dst, slice);
         enqueue(queue, *this, slice, lws_hint());
-    }
-    while(collapsed.slide_window_slice_3D(slice));
+    } while (collapsed.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
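[Note] The configure() body reflowed above encodes the kernel's vectorization scheme: a 16-byte access (vector_size_byte_opencl) clamped to the row width, with LAST_ACCESSED_X telling the OpenCL code where the final, possibly overlapping, vector access must start. A standalone sketch of the same arithmetic (the plain clamp is an assumption; the library's adjust_vec_size also rounds to a supported OpenCL vector width):

    #include <algorithm>

    // For F32 (4-byte) elements and a row of width 10:
    // vec = min(16 / 4, 10) = 4 and last_x = max(10 - 4, 0) = 6, so the last
    // vector load covers elements 6..9 and overlaps the one before it.
    int vec_size(int element_size_bytes, int row_width)
    {
        return std::min(16 / element_size_bytes, row_width);
    }
    int last_accessed_x(int row_width, int vec)
    {
        return std::max(row_width - vec, 0);
    }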
remainder_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); + build_opts.add_option_if(multi_access_x && remainder_x, + "-DLAST_ACCESSED_X=" + + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); _kernel = create_kernel(compile_context, "memset", build_opts.options()); } @@ -89,7 +94,7 @@ Status ClFillKernel::validate(const ITensorInfo *tensor, const PixelValue &const { ARM_COMPUTE_UNUSED(tensor); ARM_COMPUTE_UNUSED(constant_value); - if(window != nullptr) + if (window != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(window->x().step() != 1); } @@ -101,7 +106,8 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto tensor = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto tensor = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); // Collapse all the batches on the third Window collapsed = window.collapse_if_possible(_full_window, Window::DimZ); @@ -112,8 +118,7 @@ void ClFillKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comman unsigned int idx = 0; add_3D_tensor_argument(idx, tensor, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClFillKernel.h b/src/gpu/cl/kernels/ClFillKernel.h index f25cf928ad..5d69fbfbd1 100644 --- a/src/gpu/cl/kernels/ClFillKernel.h +++ b/src/gpu/cl/kernels/ClFillKernel.h @@ -47,7 +47,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClFillKernel::configure() diff --git a/src/gpu/cl/kernels/ClFloorKernel.cpp b/src/gpu/cl/kernels/ClFloorKernel.cpp index f9f834875a..358e84012b 100644 --- a/src/gpu/cl/kernels/ClFloorKernel.cpp +++ b/src/gpu/cl/kernels/ClFloorKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -52,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); // Validate in case of configured output - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -76,9 +77,9 @@ void ClFloorKernel::configure(const ClCompileContext &compile_context, const ITe // Validate ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); @@ -105,8 +106,9 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(IClKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -117,8 +119,7 @@ void ClFloorKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp index accafeecc2..e0d925dfb2 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include 
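[Note] ClFloorKernel handles the row tail differently from the fill kernel above: instead of shifting the last vector access, it passes the remainder count to the kernel, computed as the row width modulo the vector size. A hedged numeric example (assuming max_cl_vector_width is 16, as its use as a byte budget in the hunk suggests):

    // Illustrative values only.
    const int max_cl_vector_width = 16;
    const int element_size = 2;                                  // F16
    const int width        = 20;                                 // src->dimension(0)
    const int vec_size_x   = max_cl_vector_width / element_size; // 8 lanes requested
    const int leftovers    = width % vec_size_x;                 // 4 elements in the partial tail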
"arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,26 +49,35 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m(); @@ -83,7 +91,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != static_cast<unsigned int>(n)); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != static_cast<unsigned int>(k)); - if(gemm_info.reinterpret_input_as_3d()) + if (gemm_info.reinterpret_input_as_3d()) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } @@ -92,9 +100,10 @@ Status validate_arguments(const ITensorInfo *src0, 
const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != static_cast<unsigned int>(m)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -102,8 +111,13 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMReshapeInfo &gemm_info, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -115,17 +129,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_dst_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_dst_as_3d) + if (reinterpret_input_as_3d == reinterpret_dst_as_3d) { reinterpret_dst_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32)); + auto_init_if_empty(*dst, src0->clone() + ->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)) + .set_data_type(DataType::S32)); TensorInfo tmp_info(*dst); - if(reinterpret_dst_as_3d) + if (reinterpret_dst_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -138,12 +154,12 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // RHS matrix still needs padding on the X - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); window_changed = update_window_and_padding(win, src1_access); // window used by the execute_window_loop @@ -153,7 +169,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, 
dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -163,8 +180,13 @@ ClGemmLowpMatrixMultiplyNativeKernel::ClGemmLowpMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); @@ -175,11 +197,11 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); // We still need padding on the X dimension for the RHS matrix - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_dst_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -192,7 +214,8 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = + validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -212,8 +235,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM=" + support::cpp11::to_string(src0->dimension(1))); @@ -258,19 +283,19 @@ void ClGemmLowpMatrixMultiplyNativeKernel::configure(const CLCompileContext &com 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -280,11 +305,13 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -296,7 +323,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -304,10 +331,10 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the output has to be reinterpreted as 3D tensor - const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 
1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -317,7 +344,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -330,8 +357,7 @@ void ClGemmLowpMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Wi _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h index 4b328e0ab8..4f87096158 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_NATIVE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -55,25 +56,34 @@ public: * rhs_info.k0: same as lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyNativeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp index 15493f7ddc..ddbc809cdd 100644 --- 
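[Note] The k0/n0 assertions reflowed in this file use a bit trick: x & (x - 1) is zero exactly when x is a power of two, with 3 whitelisted as the single odd block size. A self-contained restatement of the predicate (the helper name is ours; the kernel enforces the same set through separate macros and range checks):

    // Accepts the block sizes the error message names: 2, 3, 4, 8 or 16.
    // (1 also passes the bit test, but the surrounding range checks exclude it.)
    bool is_supported_block_size(unsigned int x)
    {
        const bool pow2_or_3 = ((x & (x - 1)) == 0) || (x == 3);
        return pow2_or_3 && x >= 2 && x <= 16;
    }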
diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
index 15493f7ddc..ddbc809cdd 100644
--- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
+++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedKernel.cpp
@@ -29,13 +29,12 @@
 #include "arm_compute/core/CL/OpenCL.h"
 #include "arm_compute/core/Helpers.h"
 #include "arm_compute/core/TensorInfo.h"
-#include "arm_compute/core/Validate.h"
 #include "arm_compute/core/utils/misc/ShapeCalculator.h"
 #include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
 
 #include "src/core/helpers/AutoConfiguration.h"
 #include "src/core/helpers/WindowHelpers.h"
-
 #include "support/Cast.h"
 #include "support/StringSupport.h"
@@ -51,45 +50,55 @@ namespace
 {
 using ElementsProcessed = Steps;
 
-Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst,
-                          const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status validate_arguments(const ITensorInfo *src0,
+                          const ITensorInfo *src1,
+                          const ITensorInfo *dst,
+                          const GEMMLHSMatrixInfo &lhs_info,
+                          const GEMMRHSMatrixInfo &rhs_info,
+                          const GEMMReshapeInfo &gemm_info)
 {
     ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4");
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4,
+                                    "The number of dimensions for the LHS matrix must be <= 4");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3,
+                                    "The number of dimensions for the RHS matrix must be <= 3");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose);
     ARM_COMPUTE_RETURN_ERROR_ON(!rhs_info.transpose);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3),
+                                    "Only 2,3,4,8,16 are supported for k0");
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16);
     ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8);
-    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0");
+    ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3),
+                                    "Only 2,3,4,8,16 are supported for n0");
     ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM");
 
     const int m = gemm_info.m();
     const int n = gemm_info.n();
     const int k = gemm_info.k();
 
-    TensorShape tensor_shape0{ src0->tensor_shape() };
+    TensorShape tensor_shape0{src0->tensor_shape()};
     tensor_shape0.set(0, k);
     tensor_shape0.set(1, m);
 
-    TensorShape tensor_shape1{ src1->tensor_shape() };
+    TensorShape tensor_shape1{src1->tensor_shape()};
     tensor_shape1.set(0, n);
     tensor_shape1.set(1, k);
 
     const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0);
     const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1);
 
-    const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
-    const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
+    const TensorInfo tensor_info_reshaped0 =
+        src0->clone()->set_tensor_shape(compute_lhs_reshaped_shape(tensor_info0, lhs_info));
+    const TensorInfo tensor_info_reshaped1 =
+        src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info));
 
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0);
     ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1);
 
-    if(dst->total_size() != 0)
+    if (dst->total_size() != 0)
     {
         const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info));
         ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst);
@@ -99,19 +108,24 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons
     return Status{};
 }
 
-std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                        const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info,
-                                                        ElementsProcessed &num_elements_processed)
+std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
+                                                        const ITensorInfo *src1,
+                                                        ITensorInfo *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info,
+                                                        ElementsProcessed &num_elements_processed)
 {
     unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0];
     unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1];
     bool          reinterpret_output_as_3d            = (gemm_info.depth_output_gemm3d() != 0);
 
     // dst tensor auto initialization if not yet initialized
-    auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
+    auto_init_if_empty(
+        *dst, src0->clone()->set_tensor_shape(compute_mm_shape(*src0, *src1, gemm_info)).set_data_type(DataType::S32));
 
     TensorInfo tmp_info(*dst);
 
-    if(reinterpret_output_as_3d)
+    if (reinterpret_output_as_3d)
     {
         // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM,
         // the window needs to be constructed on the 2D collapsed version of the tensor
@@ -123,7 +137,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0,
     // Configure kernel window
     num_elems_processed_per_iteration_x = rhs_info.n0;
     num_elems_processed_per_iteration_y = lhs_info.m0;
-    Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
+    Window win =
+        calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y));
 
     // Collapse along the Z direction
     // This collapse needs to be here in order to tune the Z dimension of LWS
@@ -140,8 +155,13 @@ ClGemmLowpMatrixMultiplyReshapedKernel::ClGemmLowpMatrixMultiplyReshapedKernel()
     _type = CLKernelType::GEMM;
 }
 
-void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst,
-                                                       const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context,
+                                                       const ITensorInfo *src0,
+                                                       const ITensorInfo *src1,
+                                                       ITensorInfo *dst,
+                                                       const GEMMLHSMatrixInfo &lhs_info,
+                                                       const GEMMRHSMatrixInfo &rhs_info,
+                                                       const GEMMReshapeInfo &gemm_info)
 {
     ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst);
     ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
@@ -154,11 +174,12 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
     const unsigned int num_dimensionssrc0 = src0->num_dimensions();
     _slide_matrix_b                       = (src1->num_dimensions() >= num_dimensionssrc0);
 
-    auto padding_info = get_padding_info({ src0, src1, dst });
+    auto padding_info = get_padding_info({src0, src1, dst});
     ElementsProcessed num_elements_processed{};
 
     // Configure kernel window
-    auto win_config = validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
+    auto win_config =
+        validate_and_configure_window(src0, src1, dst, lhs_info, rhs_info, gemm_info, num_elements_processed);
     ARM_COMPUTE_ERROR_THROW_ON(win_config.first);
     ICLKernel::configure_internal(win_config.second);
 
@@ -171,8 +192,10 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
     // Create build options
     CLBuildOptions build_opts;
     build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D");
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
-    build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1)));
+    build_opts.add_option_if(_reinterpret_output_as_3d,
+                             "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2)));
     build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2)));
     build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE");
     build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE");
@@ -230,19 +253,19 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::configure(const CLCompileContext &c
     ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));
 }
 
-Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info,
-                                                        const GEMMRHSMatrixInfo &rhs_info, const GEMMReshapeInfo &gemm_info)
+Status ClGemmLowpMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0,
+                                                        const ITensorInfo *src1,
+                                                        const ITensorInfo *dst,
+                                                        const GEMMLHSMatrixInfo &lhs_info,
+                                                        const GEMMRHSMatrixInfo &rhs_info,
+                                                        const GEMMReshapeInfo &gemm_info)
 {
     ElementsProcessed num_elements_processed{};
     ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, lhs_info, rhs_info, gemm_info));
-    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(),
-                                                              src1->clone().get(),
-                                                              dst->clone().get(),
-                                                              lhs_info,
-                                                              rhs_info,
-                                                              gemm_info,
+    ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(),
                                                               dst->clone().get(), lhs_info, rhs_info, gemm_info,
                                                               num_elements_processed)
-                                .first);
+                                    .first);
 
     return Status{};
 }
@@ -252,11 +275,13 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
     ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
     ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
 
-    const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
-    const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
-    auto       dst  = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+    const auto src0 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0));
+    const auto src1 =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1));
+    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
 
-    if(src1->info()->num_dimensions() < 3)
+    if (src1->info()->num_dimensions() < 3)
     {
         // The stride_z for matrix B must be zero if we do not slice
         ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0);
@@ -268,7 +293,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
     slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1));
     slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1));
 
-    if(_reinterpret_output_as_3d)
+    if (_reinterpret_output_as_3d)
     {
         // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor
         const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 4;
@@ -281,7 +306,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
         Window slice_b = slice;
         // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2
         // This scenario can happen when the matrix multiplication is used to perform a convolution operation
-        if(!_slide_matrix_b)
+        if (!_slide_matrix_b)
         {
             slice_b = slice_matrix_b;
         }
@@ -295,8 +320,7 @@ void ClGemmLowpMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2]));
         _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2]));
         enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items);
-    }
-    while(window.slide_window_slice_3D(slice));
+    } while (window.slide_window_slice_3D(slice));
 }
 } // namespace kernels
 } // namespace opencl
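[Note] Both GEMM-lowp kernels in this patch compute their execution window on a 2D view whenever the destination is reinterpreted as 3D: height and depth are folded into a single M dimension before calculate_max_window() runs on tmp_info. A sketch of that collapse (the helper is hypothetical; the library's exact call is not visible in these hunks):

    // Fold (W, H, D, ...) into (W, H * D, 1, ...) so a 2D GEMM window applies.
    arm_compute::TensorShape collapse_for_2d_gemm(arm_compute::TensorShape s)
    {
        s.set(1, s[1] * s[2]); // fold depth into the M dimension
        s.set(2, 1);
        return s;
    }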
+ const GEMMReshapeInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - unsigned int _k{ 1 }; - bool _use_dummy_work_items{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + unsigned int _k{1}; + bool _use_dummy_work_items{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp index 5d552b8d63..2f1f3b8df0 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -29,14 +29,13 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/AccessWindowStatic.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -54,45 +53,57 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - if(src0->data_type() == DataType::QASYMM8) + if (src0->data_type() == DataType::QASYMM8) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); } else { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::QASYMM8, DataType::QSYMM8, + DataType::QASYMM8_SIGNED, DataType::QSYMM8_PER_CHANNEL); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), "Only 2,3,4,8,16 are supported for k0"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3) || (rhs_info.k0 > 16)), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3) || rhs_info.n0 > 16), + "Only 2,3,4,8,16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } @@ -103,11 +114,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -117,39 +128,42 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, 
vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -161,30 +175,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -194,9 +210,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + 
const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -211,16 +234,17 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -229,7 +253,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -242,12 +266,14 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = gemm_info.rhs_info.n0; num_elems_processed_per_iteration_y = gemm_info.lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, vector_sum_col_access); @@ -255,17 +281,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win_out, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal 
output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win_out, output_multipliers_access, output_shifts_access); } } @@ -275,7 +303,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -285,15 +314,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::ClGemmLowpMatrixMultiplyReshapedO _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -307,7 +343,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
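To make the role of the vector_sum_col / vector_sum_row arguments threaded through configure() concrete: the fused output stage must fold the quantization offsets back into the raw S32 accumulators, and the K_OFFSET build option used by the offset-contribution kernels further down is the a_offset * b_offset * K term of the same algebraic expansion. Below is a minimal self-contained C++ sketch of that identity — toy names only, not ACL API; whether a_offset/b_offset are the zero points or their negatives is a sign convention folded in by the calling layer.

    #include <cassert>
    #include <cstdint>
    #include <vector>

    int main()
    {
        // Toy 1-D dot product standing in for one (x, y) cell of the GEMM.
        const std::vector<int32_t> a{3, -1, 7, 2};   // one row of the LHS
        const std::vector<int32_t> b{5, 4, -2, 6};   // one column of the RHS
        const int32_t a_off = -128, b_off = 3;       // quantization offsets (sign convention folded in)
        const int32_t K = static_cast<int32_t>(a.size());

        int32_t raw = 0, row_sum = 0, col_sum = 0, full = 0;
        for (int32_t k = 0; k < K; ++k)
        {
            raw += a[k] * b[k];                      // what the integer GEMM actually computes
            row_sum += a[k];                         // one element of vector_sum_row (LHS row sums)
            col_sum += b[k];                         // one element of vector_sum_col (RHS column sums)
            full += (a[k] + a_off) * (b[k] + b_off); // the offset-corrected result we want
        }

        // Offset contribution: raw + a_off * colsum(B) + b_off * rowsum(A) + a_off * b_off * K.
        // The last term is what the kernels pass as K_OFFSET.
        assert(full == raw + a_off * col_sum + b_off * row_sum + a_off * b_off * K);
        return 0;
    }

This is why the validation above insists that vector_sum_col match dst's N dimension and vector_sum_row match its M dimension: each holds exactly one pre-reduced sum per output column or row.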
- if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -320,7 +356,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -341,8 +378,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon CLBuildOptions build_opts; build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(rhs_info.interleave, "-DRHS_INTERLEAVE"); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); @@ -361,12 +400,12 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? "t" : "nt"; - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { kernel_name += "_fused_output_stage_fixedpoint"; _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -377,9 +416,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); // In case of _is_quantized_per_channel, RESULT_MULTIPLIER and RESULT_SHIFT are not utilized, but they are passed as a part of T_QUANTIZE8 macro. 
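For reference, the arithmetic that the RESULT_MULTIPLIER / RESULT_SHIFT / RESULT_OFFSET options feed follows the widely used gemmlowp fixed-point formulation, with the clamp bounds corresponding to gemmlowp_min_bound / gemmlowp_max_bound. The scalar sketch below is a hedged reference only — it assumes a non-negative right shift and a QASYMM8 destination; the CL kernel source remains the authoritative version.

    #include <algorithm>
    #include <cstdint>

    // High 32 bits of 2*a*b with rounding, saturating the single overflow case
    // (a == b == INT32_MIN). The ">> 31" is an arithmetic shift on int64_t.
    static int32_t rounding_doubling_high_mul(int32_t a, int32_t b)
    {
        if (a == b && a == INT32_MIN)
        {
            return INT32_MAX;
        }
        const int64_t ab    = static_cast<int64_t>(a) * b;
        const int32_t nudge = ab >= 0 ? (1 << 30) : (1 - (1 << 30));
        return static_cast<int32_t>((ab + nudge) >> 31);
    }

    // Divide by 2^exponent with rounding to nearest; exponent in [0, 31).
    static int32_t rounding_divide_by_pot(int32_t x, int exponent)
    {
        const int32_t mask      = (1 << exponent) - 1;
        const int32_t remainder = x & mask;
        const int32_t threshold = (mask >> 1) + (x < 0 ? 1 : 0);
        return (x >> exponent) + (remainder > threshold ? 1 : 0);
    }

    // One S32 accumulator through the fused fixed-point output stage.
    static uint8_t quantize_down_fixedpoint(int32_t acc,
                                            int32_t result_multiplier, // RESULT_MULTIPLIER
                                            int32_t result_shift,      // RESULT_SHIFT (>= 0 here)
                                            int32_t result_offset)     // RESULT_OFFSET
    {
        int32_t v = rounding_doubling_high_mul(acc, result_multiplier);
        v = rounding_divide_by_pot(v, result_shift);
        v += result_offset;                // destination zero point
        v = std::min(std::max(v, 0), 255); // gemmlowp_min_bound / gemmlowp_max_bound for QASYMM8
        return static_cast<uint8_t>(v);
    }

    int main()
    {
        // Multiplier 1 << 30 encodes an effective scale of 0.5 (M / 2^31),
        // so the accumulator 100 maps to 100 * 0.5 + 10 = 60.
        const uint8_t q = quantize_down_fixedpoint(100, 1 << 30, 0, 10);
        return q == 60 ? 0 : 1;
    }

In the per-channel case the scalar multiplier/shift pair is simply replaced by the output_multipliers / output_shifts tensors indexed by output channel, which is why the kernel skips the RESULT_MULTIPLIER / RESULT_SHIFT defines there.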
- if(!_is_quantized_per_channel) + if (!_is_quantized_per_channel) { - build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); + build_opts.add_option("-DRESULT_MULTIPLIER=" + + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage.gemmlowp_shifts[0])); } else @@ -432,42 +472,56 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - - if(src1->info()->num_dimensions() < 3) + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -479,7 +533,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3; @@ -487,10 +541,10 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor - const unsigned 
int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); + const unsigned int idx0 = 3 * num_arguments_per_2D_tensor() + 3 + (_reinterpret_input_as_3d ? 1 : 0); const unsigned int total_cross_plane_pad = dst->info()->padding().top + dst->info()->padding().bottom; _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -515,7 +569,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -527,19 +581,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be reinterpreted as 3D tensor idx++; } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor idx++; } - if(_fuse_output_stage) + if (_fuse_output_stage) { add_2D_tensor_argument_if((vector_sum_col != nullptr), idx, vector_sum_col, win_vector_sum_col); add_2D_tensor_argument_if((vector_sum_row != nullptr), idx, vector_sum_row, win_vector_sum_row); @@ -548,8 +602,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); } enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h index a77604db7c..1d4696b089 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -70,31 +71,44 @@ public: * @param[in] output_shifts (Optional) Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32. 
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _is_quantized_per_channel{ false }; - bool _fuse_output_stage{ false }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _is_quantized_per_channel{false}; + bool _fuse_output_stage{false}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 792c71da76..030c11d069 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,15 @@ */ #include "src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" namespace arm_compute @@ -47,39 +46,51 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.k0 != 4 || lhs_info.k0 != 4, "Only 4 is supported as value for k0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), "Only 1,2,4 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), "Only 1,4,8 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(lhs_info.m0 == 1 || lhs_info.m0 == 2 || lhs_info.m0 == 4), + "Only 1,2,4 are supported for m0"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(!(rhs_info.n0 == 1 || rhs_info.n0 == 4 || rhs_info.n0 == 8), + "Only 1,4,8 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for quantized GEMM"); const int m = gemm_info.m; const int n = gemm_info.n; const int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != static_cast<unsigned int>(k)); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != static_cast<unsigned int>(m)); } @@ -90,11 +101,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_dst_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); - if(output_stage.type == GEMMLowpOutputStageType::NONE) + if (output_stage.type == GEMMLowpOutputStageType::NONE) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -104,38 +115,41 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != bias->dimension(0)); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN) || + (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FLOAT), "Only GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT is supported"); // Checks performed if the dst stage needs to be fused - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // If a_offset == 0, vector_sum_col can be a nullptr - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != expected_dst_shape[0]); } // If b_offset == 0, vector_sum_row can be a nullptr - if(gemm_info.b_offset != 0) + if (gemm_info.b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if mm result is a 3D reinterpretation - const bool reinterpret_as_3d = expected_dst_shape.num_dimensions() > 1 && expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + expected_dst_shape.num_dimensions() > 1 && 
expected_dst_shape.y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (expected_dst_shape[1] * expected_dst_shape[2])); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (expected_dst_shape[1] * expected_dst_shape[2])); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != expected_dst_shape[1]); - if(expected_dst_shape.num_dimensions() > 1) + if (expected_dst_shape.num_dimensions() > 1) { const unsigned int dst_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -147,30 +161,32 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != collapsed_dst_shape[dst_batch_idx], "vector_sum_row must have the same number of batches of dst tensor"); - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - if(output_multipliers != nullptr && output_shifts != nullptr) + if (output_multipliers != nullptr && output_shifts != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_multipliers, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(expected_dst_shape[0] != output_multipliers->dimension(0)); @@ -180,9 +196,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts, ElementsProcessed &num_elements_processed) +std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts, + ElementsProcessed &num_elements_processed) { const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -200,9 +223,10 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, reinterpret_output_as_3d = false; // dst 
tensor auto initialization if not yet initialized const TensorShape expected_dst_shape = compute_mm_shape(*src0, *src1, gemm_info); - if(output_stage.type != GEMMLowpOutputStageType::NONE) + if (output_stage.type != GEMMLowpOutputStageType::NONE) { - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(expected_dst_shape).set_data_type(output_stage.output_data_type)); } else { @@ -211,7 +235,7 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -224,11 +248,12 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, num_elems_processed_per_iteration_x = 1; num_elems_processed_per_iteration_y = 1; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - if(gemm_info.a_offset != 0) + if (gemm_info.a_offset != 0) { AccessWindowHorizontal vector_sum_col_access(vector_sum_col, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, vector_sum_col_access); @@ -236,17 +261,19 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, // No access window needed for vector_sum_row ARM_COMPUTE_UNUSED(vector_sum_row); - if(bias != nullptr) + if (bias != nullptr) { AccessWindowHorizontal bias_access(bias, 0, num_elems_processed_per_iteration_x); window_changed = window_changed || update_window_and_padding(win, bias_access); } - if(output_multipliers != nullptr && output_stage.is_quantized_per_channel) + if (output_multipliers != nullptr && output_stage.is_quantized_per_channel) { - AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, num_elems_processed_per_iteration_x); + AccessWindowHorizontal output_multipliers_access(output_multipliers, 0, + num_elems_processed_per_iteration_x); AccessWindowHorizontal output_shifts_access(output_shifts, 0, num_elems_processed_per_iteration_x); - window_changed = window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); + window_changed = + window_changed || update_window_and_padding(win, output_multipliers_access, output_shifts_access); } } @@ -278,7 +305,8 @@ std::pair<Status, Window> validate_and_configure_window(const ITensorInfo *src0, collapsed.set(Window::DimX, x_dimension); collapsed.set(Window::DimY, y_dimension); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -288,15 +316,22 @@ ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmLowpMatrixMultiplyResha _type = CLKernelType::GEMM; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, - const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, ITensorInfo *bias, - ITensorInfo *output_multipliers, ITensorInfo *output_shifts) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + ITensorInfo *bias, + ITensorInfo *output_multipliers, + ITensorInfo *output_shifts) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ src0, src1, dst, vector_sum_row }); + auto padding_info = get_padding_info({src0, src1, dst, vector_sum_row}); const GEMMRHSMatrixInfo rhs_info = gemm_info.rhs_info; const GEMMLHSMatrixInfo lhs_info = gemm_info.lhs_info; const GEMMLowpOutputStageInfo output_stage = gemm_info.output_stage; @@ -313,7 +348,8 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -334,18 +370,19 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemmlowp_mm_reshaped_only_rhs_mmul"); - if(output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { build_opts.add_option("-DFUSED_OUTPUT_STAGE_FIXED_POINT"); _fuse_output_stage = true; // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0 && vector_sum_col != nullptr) + if (a_offset != 0 && 
vector_sum_col != nullptr) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -396,42 +433,54 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompil ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ElementsProcessed num_elements_processed{}; - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, output_multipliers, output_shifts)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - dst->clone().get(), - gemm_info, - vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, - vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, - bias != nullptr ? bias->clone().get() : nullptr, - output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, - output_shifts != nullptr ? output_shifts->clone().get() : nullptr, - num_elements_processed) - .first); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, dst, gemm_info, vector_sum_col, vector_sum_row, bias, + output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src0->clone().get(), src1->clone().get(), dst->clone().get(), gemm_info, + vector_sum_col != nullptr ? vector_sum_col->clone().get() : nullptr, + vector_sum_row != nullptr ? vector_sum_row->clone().get() : nullptr, + bias != nullptr ? bias->clone().get() : nullptr, + output_multipliers != nullptr ? output_multipliers->clone().get() : nullptr, + output_shifts != nullptr ? 
output_shifts->clone().get() : nullptr, + num_elements_processed) + .first); return Status{}; } -void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -449,7 +498,7 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(src2 != nullptr) + if (src2 != nullptr) { add_3d_tensor_nhw_argument(idx, src2); } @@ -461,21 +510,20 @@ void ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tens _kernel.setArg<cl_int>(idx++, _n); _kernel.setArg<cl_int>(idx++, _k); - if(_fuse_output_stage) + if (_fuse_output_stage) { - if(vector_sum_col != nullptr) + if (vector_sum_col != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_col); } - if(vector_sum_row != nullptr) + if (vector_sum_row != nullptr) { add_3d_tensor_nhw_argument(idx, vector_sum_row); } } enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 0ae549cd53..fc8b73140d 100644 --- a/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define 
ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/IClKernel.h" @@ -65,29 +66,42 @@ public: * @param[in] output_multipliers (Optional) Output multipliers tensor. Supported data types: S32. * @param[in] output_shifts (Optional) Output shifts tensor. Supported data types: S32. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, ITensorInfo *bias = nullptr, - ITensorInfo *output_multipliers = nullptr, ITensorInfo *output_shifts = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + ITensorInfo *bias = nullptr, + ITensorInfo *output_multipliers = nullptr, + ITensorInfo *output_shifts = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *dst, const GEMMKernelInfo &gemm_info, - const ITensorInfo *vector_sum_col = nullptr, const ITensorInfo *vector_sum_row = nullptr, const ITensorInfo *bias = nullptr, - const ITensorInfo *output_multipliers = nullptr, const ITensorInfo *output_shifts = nullptr); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *dst, + const GEMMKernelInfo &gemm_info, + const ITensorInfo *vector_sum_col = nullptr, + const ITensorInfo *vector_sum_row = nullptr, + const ITensorInfo *bias = nullptr, + const ITensorInfo *output_multipliers = nullptr, + const ITensorInfo *output_shifts = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _fuse_output_stage{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _fuse_output_stage{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMULKERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp index 9ec0b5182f..d93dbde95a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.cpp @@ -28,11 +28,10 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -44,12 +43,16 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -57,26 +60,28 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if (b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 
3 : 2; @@ -87,13 +92,15 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches of output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches of vector_sum_row_shape or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches of " + "vector_sum_row_shape or the number of batches must be set to 1"); } } } @@ -108,29 +115,34 @@ ClGemmLowpOffsetContributionKernel::ClGemmLowpOffsetContributionKernel() } void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset) + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias }); + auto padding_info = get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias}); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(4, mm_result->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -138,8 +150,10 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - 
build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); std::string kernel_name("gemmlowp_offset_contribution"); @@ -165,10 +179,15 @@ void ClGemmLowpOffsetContributionKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t a_offset, int32_t b_offset) +Status ClGemmLowpOffsetContributionKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, a_offset, b_offset)); return Status{}; } @@ -177,10 +196,13 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IClKernel::window(), window); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto mm_result = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_SRC_DST)); Window collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -209,8 +231,7 @@ void ClGemmLowpOffsetContributionKernel::run_op(ITensorPack &tensors, const Wind add_1D_tensor_argument_if((bias != nullptr), idx, bias, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h index 48926e280b..2080a3a091 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionKernel.h @@ -67,15 +67,25 @@ public: * @param[in] b_offset Offset to be added to each element of the matrix B. 
*/ void configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - int32_t k, int32_t a_offset, int32_t b_offset); + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t k, + int32_t a_offset, + int32_t b_offset); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, int32_t a_offset, int32_t b_offset); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + int32_t a_offset, + int32_t b_offset); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp index c5fb54f524..26f479f61a 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.cpp @@ -34,7 +34,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,12 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, - int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status validate_arguments(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(mm_result, 1, DataType::S32); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -62,33 +69,35 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON(output_multipliers->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output_shifts, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(output_shifts->num_dimensions() > 1); - if(output_stage.is_quantized_per_channel) + if (output_stage.is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_shifts->dimension(0)); ARM_COMPUTE_RETURN_ERROR_ON(mm_result->dimension(0) != output_multipliers->dimension(0)); } // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_col, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(vector_sum_col->dimension(0) != mm_result->dimension(0)); } // If b_offset == 0, vector_sum_row can be a nullptr - if(b_offset != 0) + if 
(b_offset != 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(vector_sum_row, 1, DataType::S32); // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = + mm_result->num_dimensions() > 1 && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Validate input - ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != (mm_result->dimension(1) * mm_result->dimension(2))); + ARM_COMPUTE_RETURN_ERROR_ON(reinterpret_as_3d && vector_sum_row->dimension(0) != + (mm_result->dimension(1) * mm_result->dimension(2))); ARM_COMPUTE_RETURN_ERROR_ON(!reinterpret_as_3d && vector_sum_row->dimension(0) != mm_result->dimension(1)); TensorShape output_shape = mm_result->tensor_shape(); - if(output_shape.num_dimensions() > 1) + if (output_shape.num_dimensions() > 1) { const unsigned int output_batch_idx = reinterpret_as_3d ? 3 : 2; @@ -99,20 +108,22 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_row_shape[1] != output_shape[output_batch_idx], "mm_result tensor must have the same number of batches as the output tensor"); - if(a_offset != 0) + if (a_offset != 0) { TensorShape vector_sum_col_shape = vector_sum_col->tensor_shape(); vector_sum_col_shape.collapse_from(1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && vector_sum_col_shape[1] != vector_sum_row_shape[1], - "vector_sum_col tensor must have the same number of batches as vector_sum_row or the number of batches must be set to 1"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(vector_sum_col_shape[1] != 1 && + vector_sum_col_shape[1] != vector_sum_row_shape[1], + "vector_sum_col tensor must have the same number of batches as " + "vector_sum_row or the number of batches must be set to 1"); } } } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.type == GEMMLowpOutputStageType::NONE); // Checks performed when output is configured - if((dst != nullptr) && (dst->total_size() != 0)) + if ((dst != nullptr) && (dst->total_size() != 0)) { ARM_COMPUTE_RETURN_ERROR_ON(output_stage.output_data_type != dst->data_type()); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); @@ -120,7 +131,8 @@ Status validate_arguments(const ITensorInfo *mm_result, const ITensorInfo *vecto } ARM_COMPUTE_RETURN_ERROR_ON(output_stage.gemmlowp_min_bound > output_stage.gemmlowp_max_bound); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), "per channel quantization info is incorrect"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(output_stage.gemmlowp_multipliers.size() != output_stage.gemmlowp_shifts.size(), + "per channel quantization info is incorrect"); return Status{}; } @@ -131,16 +143,26 @@ ClGemmLowpOffsetContributionOutputStageKernel::ClGemmLowpOffsetContributionOutpu _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +void ClGemmLowpOffsetContributionOutputStageKernel::configure(const 
CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mm_result, dst, output_multipliers, output_shifts); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); - auto padding_info = get_padding_info({ mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts }); + auto padding_info = + get_padding_info({mm_result, vector_sum_col, vector_sum_row, bias, dst, output_multipliers, output_shifts}); const int min = output_stage.gemmlowp_min_bound; const int max = output_stage.gemmlowp_max_bound; @@ -148,9 +170,8 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon _is_quantized_per_channel = output_stage.is_quantized_per_channel; // Check if input is a 3D reinterpretation - const bool reinterpret_as_3d = vector_sum_row != nullptr - && mm_result->num_dimensions() > 1 - && mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); + const bool reinterpret_as_3d = vector_sum_row != nullptr && mm_result->num_dimensions() > 1 && + mm_result->tensor_shape().y() != vector_sum_row->tensor_shape().x(); // Auto initialize the output auto_init_if_empty(*dst, mm_result->clone()->set_data_type(output_stage.output_data_type)); @@ -160,10 +181,11 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mm_result->dimension(0) % num_elems_processed_per_iteration)); // If a_offset == 0, vector_sum_col can be a nullptr - if(a_offset != 0) + if (a_offset != 0) { build_opts.add_option("-DA_OFFSET=" + support::cpp11::to_string(a_offset)); build_opts.add_option_if(vector_sum_col->tensor_shape().num_dimensions() > 1, "-DSUM_COL_HAS_BATCHES"); @@ -171,8 +193,10 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon // If b_offset == 0, vector_sum_row can be a nullptr build_opts.add_option_if(b_offset != 0, "-DB_OFFSET=" + support::cpp11::to_string(b_offset)); build_opts.add_option("-DK_OFFSET=" + support::cpp11::to_string(a_offset * b_offset * k)); - build_opts.add_option_if(reinterpret_as_3d, "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); - build_opts.add_option_if(reinterpret_as_3d, "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); + build_opts.add_option_if(reinterpret_as_3d, + "-DHEIGHT_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(1))); + build_opts.add_option_if(reinterpret_as_3d, + "-DDEPTH_INPUT3D=" + support::cpp11::to_string(mm_result->dimension(2))); build_opts.add_option_if(bias != nullptr, 
"-DADD_BIAS"); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage.gemmlowp_offset)); build_opts.add_option("-DRESULT_MULTIPLIER=" + support::cpp11::to_string(output_stage.gemmlowp_multipliers[0])); @@ -210,26 +234,42 @@ void ClGemmLowpOffsetContributionOutputStageKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, - const ITensorInfo *dst, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts) +Status ClGemmLowpOffsetContributionOutputStageKernel::validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, b_offset, output_stage, output_multipliers, output_shifts)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(mm_result, vector_sum_col, vector_sum_row, bias, dst, a_offset, + b_offset, output_stage, output_multipliers, output_shifts)); return Status{}; } -void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto mm_result = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - const auto vector_sum_col = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); - const auto vector_sum_row = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); - const auto output_shifts = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); - const auto output_multipliers = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto mm_result = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + const auto vector_sum_col = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_COL_SUM)); + const auto vector_sum_row = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_VEC_ROW_SUM)); + const auto output_shifts = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SHIFTS)); + const auto output_multipliers = + utils::cast::polymorphic_downcast<const 
ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_MULTIPLIERS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); @@ -260,8 +300,7 @@ void ClGemmLowpOffsetContributionOutputStageKernel::run_op(ITensorPack &tensors, add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_multipliers, biases_slice); add_1D_tensor_argument_if(_is_quantized_per_channel, idx, output_shifts, biases_slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h index cee04473c4..97ee9bc97f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpOffsetContributionOutputStageKernel.h @@ -66,23 +66,40 @@ public: * @param[in] output_shifts Output shifts tensor. In case of per-channel quantization, the number of multipliers must be equal to the number of filters (OFM). * Supported data types: S32 */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, ITensorInfo *dst, - int32_t k, int32_t a_offset, int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, - const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + ITensorInfo *dst, + int32_t k, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpOffsetContributionOutputStageKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mm_result, const ITensorInfo *vector_sum_col, const ITensorInfo *vector_sum_row, const ITensorInfo *bias, const ITensorInfo *dst, int32_t a_offset, - int32_t b_offset, const GEMMLowpOutputStageInfo &output_stage, const ITensorInfo *output_multipliers, const ITensorInfo *output_shifts); + static Status validate(const ITensorInfo *mm_result, + const ITensorInfo *vector_sum_col, + const ITensorInfo *vector_sum_row, + const ITensorInfo *bias, + const ITensorInfo *dst, + int32_t a_offset, + int32_t b_offset, + const GEMMLowpOutputStageInfo &output_stage, + const ITensorInfo *output_multipliers, + const ITensorInfo *output_shifts); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _is_quantized_per_channel{ false }; + bool _is_quantized_per_channel{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp index 39754385a1..7b7beab12c 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp +++ 
b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,20 +46,23 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching dst data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -75,7 +77,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::ClGemmLowpQuantizeDownInt32S _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -84,14 +88,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(const ITenso return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // dst auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -103,19 +110,26 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi auto max = info->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % 
num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET_AFTER_SHIFT=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DRESULT_FIXEDPOINT_MULTIPLIER=" + support::cpp11::to_string(info->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(info->gemmlowp_shift)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && (min != max), - "-DMAX_BOUND=" + support::cpp11::to_string(max)); + build_opts.add_option_if( + (min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMIN_BOUND=" + support::cpp11::to_string(min)); + build_opts.add_option_if( + (max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))) && + (min != max), + "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); // Create kernel - const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" : "gemmlowp_output_stage_quantize_down_fixedpoint"; + const std::string kernel_name = (info->output_data_type == DataType::QSYMM16) + ? "gemmlowp_output_stage_quantize_down_fixedpoint_qsymm16" + : "gemmlowp_output_stage_quantize_down_fixedpoint"; // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -129,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure(const CLCompi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Create src window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -144,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -158,8 +176,7 @@ void 
ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::run_op(ITensorPack &ten add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h index 69b5fc5018..71c9f4b752 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h @@ -60,14 +60,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED/QSYMM16. * @param[in] info Output stage info. Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp index f379698326..52ebd32d46 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.cpp @@ -27,15 +27,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,23 +46,31 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && (info->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(info->gemmlowp_min_bound < 
std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) - || info->gemmlowp_min_bound > info->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((info->output_data_type != DataType::QASYMM8) && + (info->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + info->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(info->output_data_type)) || + info->gemmlowp_min_bound > info->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != info->output_data_type, "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); @@ -78,7 +85,9 @@ ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::ClGemmLowpQuantizeDownInt32ScaleB _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, +Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -87,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::validate(const ITensorInfo return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, info)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(info->output_data_type)); @@ -107,7 +119,8 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DREAL_MULTIPLIER=" + float_to_string_with_full_precision(info->gemmlowp_real_multiplier)); build_opts.add_option("-DOUTPUT_OFFSET=" + support::cpp11::to_string(info->gemmlowp_offset)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); @@ -130,14 +143,18 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); }
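The three quantize-down kernels in this area differ only in how they scale the S32 accumulator back to an 8/16-bit value: by an integer multiplier and shift (ScaleKernel), by a fixed-point multiplier (ScaleByFixedPointKernel), or by a real multiplier (ScaleByFloatKernel). A minimal scalar sketch of the fixed-point path, assuming round-to-nearest and omitting the INT32_MIN saturation corner case (helper names here are illustrative, not the kernels' OpenCL source):

    #include <algorithm>
    #include <cstdint>

    // Round-to-nearest doubling high multiply: (acc * multiplier * 2) >> 31.
    int32_t rounding_doubling_high_mul(int32_t acc, int32_t multiplier)
    {
        const int64_t prod = static_cast<int64_t>(acc) * static_cast<int64_t>(multiplier);
        return static_cast<int32_t>((prod + (static_cast<int64_t>(1) << 30)) >> 31);
    }

    // Requantize one S32 GEMM accumulator to QASYMM8: scale, shift, offset, clamp.
    uint8_t quantize_down_fixedpoint(int32_t acc, int32_t multiplier, int shift, int32_t offset)
    {
        int32_t v = rounding_doubling_high_mul(acc, multiplier);
        if (shift > 0)
        {
            v = (v + (1 << (shift - 1))) >> shift; // rounding arithmetic shift right
        }
        v += offset; // corresponds to -DRESULT_OFFSET_AFTER_SHIFT
        return static_cast<uint8_t>(std::min(255, std::max(0, v))); // -DMIN_BOUND/-DMAX_BOUND clamp
    }

The -DRESULT_FIXEDPOINT_MULTIPLIER, -DRESULT_SHIFT and -DRESULT_OFFSET_AFTER_SHIFT build options seen above pass exactly these three constants to the OpenCL program; the by-float variant instead folds multiplier and shift into the single -DREAL_MULTIPLIER constant.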
-void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Create input window Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); @@ -145,7 +162,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, // Setup bias slice unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -159,8 +176,7 @@ void ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::run_op(ITensorPack &tensors, add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h index 8eda24d25f..057c66767f 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info Output stage info. 
Used to pass the quantized output data type */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleByFloatKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp index 5d54db214a..31434ce61b 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.cpp @@ -26,15 +26,14 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Helpers.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -46,25 +45,34 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_max_bound > std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); - ARM_COMPUTE_RETURN_ERROR_ON(output_stage->gemmlowp_min_bound < std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) - || output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); + ARM_COMPUTE_RETURN_ERROR_ON((output_stage->output_data_type != DataType::QASYMM8) && + (output_stage->output_data_type != DataType::QASYMM8_SIGNED)); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_max_bound > + std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))); + ARM_COMPUTE_RETURN_ERROR_ON( + output_stage->gemmlowp_min_bound < + std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type)) || + output_stage->gemmlowp_min_bound > output_stage->gemmlowp_max_bound); // Check biases if exist - if(bias != nullptr) + if (bias != nullptr) { 
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, bias); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != bias->dimension(0)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, "Mismatching output data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() != output_stage->output_data_type, + "Mismatching output data type"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); } @@ -77,7 +85,10 @@ ClGemmLowpQuantizeDownInt32ScaleKernel::ClGemmLowpQuantizeDownInt32ScaleKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) +Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, bias, dst, output_stage)); @@ -85,14 +96,17 @@ Status ClGemmLowpQuantizeDownInt32ScaleKernel::validate(const ITensorInfo *src, return Status{}; } -void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, +void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, output_stage)); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_data_type(output_stage->output_data_type)); @@ -104,13 +118,18 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::configure(const CLCompileContext &c auto max = output_stage->gemmlowp_max_bound; CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DRESULT_OFFSET=" + support::cpp11::to_string(output_stage->gemmlowp_offset)); build_opts.add_option("-DRESULT_MULT_INT=" + support::cpp11::to_string(output_stage->gemmlowp_multiplier)); build_opts.add_option("-DRESULT_SHIFT=" + support::cpp11::to_string(output_stage->gemmlowp_shift)); - build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((min > std::get<0>(quantization::get_min_max_values_from_quantized_data_type( + output_stage->output_data_type))) && + (min != max), "-DMIN_BOUND=" + support::cpp11::to_string(min)); - build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type(output_stage->output_data_type))) && (min != max), + build_opts.add_option_if((max < std::get<1>(quantization::get_min_max_values_from_quantized_data_type( + 
output_stage->output_data_type))) + + (min != max), "-DMAX_BOUND=" + support::cpp11::to_string(max)); build_opts.add_option("-DOUTPUT_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); build_opts.add_option_if(bias != nullptr, "-DADD_BIAS"); @@ -135,15 +154,17 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - const auto bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + const auto bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_BIAS)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = collapsed.first_slice_window_3D(); unsigned int idx1 = num_arguments_per_3D_tensor(); - if(bias != nullptr) + if (bias != nullptr) { Window biases_slice(slice); biases_slice.set(Window::DimY, Window::Dimension(0, 1, 1)); @@ -157,8 +178,7 @@ void ClGemmLowpQuantizeDownInt32ScaleKernel::run_op(ITensorPack &tensors, const add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx1, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h index 84c5060362..e6390801f1 100644 --- a/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h @@ -62,14 +62,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] output_stage GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpQuantizeDownInt32ScaleKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo *output_stage); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo *output_stage); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMMLOWP_QUANTIZEDOWN_INT32_SCALE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp index ea88b485a0..ee4a191fed 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.cpp @@ -32,7 +32,6 @@ #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -47,12 +46,15 @@ namespace Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(1), "Output vector must have length equal to the number of rows of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(1), + "Output vector must have length equal to the number of rows of the input matrix"); } return Status{}; } @@ -60,12 +62,15 @@ Status validate_arguments_matrix_a_reduction(const ITensorInfo *src, const ITens Status validate_arguments_matrix_b_reduction(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->dimension(0) != src->dimension(0), "Output vector must have length equal to the number of columns of the input matrix"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->dimension(0) != src->dimension(0), + "Output vector must have length equal to the number of columns of the input matrix"); } return Status{}; } @@ -76,7 +81,10 @@ IClGemmLowpReductionKernel::IClGemmLowpReductionKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { // Perform validate step ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_a, vector_sum_row); @@ -85,7 +93,7 @@ void ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_row, TensorShape(mtx_a->dimension(1)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_a, vector_sum_row }); + auto padding_info = get_padding_info({mtx_a, vector_sum_row}); // Set the arguments to pass at compile time CLBuildOptions build_opts; @@ -120,7 +128,9 @@ void 
ClGemmLowpMatrixAReductionKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixAReductionKernel::validate(const ITensorInfo *mtx_a, + const ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_a_reduction(mtx_a, vector_sum_row)); @@ -133,8 +143,9 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimY); Window slice_in = collapsed.first_slice_window_2D(); @@ -151,11 +162,13 @@ void ClGemmLowpMatrixAReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } -void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(mtx_b, vector_sum_col); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -163,14 +176,15 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile // Output auto initialization if not yet initialized auto_init_if_empty(*vector_sum_col, TensorShape(mtx_b->dimension(0)), 1, DataType::S32); - auto padding_info = get_padding_info({ mtx_b, vector_sum_col }); + auto padding_info = get_padding_info({mtx_b, vector_sum_col}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, mtx_b->dimension(0)); // Set the arguments to pass at compile time CLBuildOptions build_opts; build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(mtx_b->dimension(0) % num_elems_processed_per_iteration)); build_opts.add_option("-DCOLS_B=" + support::cpp11::to_string(mtx_b->dimension(0))); build_opts.add_option("-DROWS_B=" + support::cpp11::to_string(mtx_b->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(mtx_b->data_type())); @@ -192,7 +206,9 @@ void ClGemmLowpMatrixBReductionKernel::configure(const CLCompileContext &compile 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) +Status ClGemmLowpMatrixBReductionKernel::validate(const ITensorInfo *mtx_b, + const ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) { ARM_COMPUTE_UNUSED(info); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_matrix_b_reduction(mtx_b, vector_sum_col)); @@ -205,8 +221,9 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window collapsed = window.collapse_if_possible(IKernel::window(), Window::DimY); @@ -222,8 +239,7 @@ void ClGemmLowpMatrixBReductionKernel::run_op(ITensorPack &tensors, const Window add_3D_tensor_argument(idx, src, slice_in); add_2D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_out, lws_hint()); - } - while(collapsed.slide_window_slice_2D(slice_out)); + } while (collapsed.slide_window_slice_2D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h index 7119b5fee0..c81543e4c2 100644 --- a/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h +++ b/src/gpu/cl/kernels/ClGemmLowpReductionKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMMLOWP_REDUCTION_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,10 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - virtual void configure(const CLCompileContext &compile_context, const ITensorInfo *input, ITensorInfo *output, const GEMMLowpReductionKernelInfo &info) = 0; + virtual void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + ITensorInfo *output, + const GEMMLowpReductionKernelInfo &info) = 0; }; /** OpenCL kernel used to compute the row-vectors of sums of all the entries in each row of Matrix A. @@ -74,14 +78,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. 
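These reductions exist to feed the offset-contribution step seen earlier in this diff: with non-zero quantization offsets, the quantized GEMM needs the sum of every row of matrix A and of every column of matrix B (compare the -DA_OFFSET/-DB_OFFSET/-DK_OFFSET build options in the offset-contribution kernels). Functionally the two kernels compute the equivalent of the following host-side sketch (illustrative C++ with hypothetical names; the real kernels vectorise and window this work):

    #include <cstdint>
    #include <vector>

    // vector_sum_row: one S32 sum per row of an M x K matrix A (row-major).
    std::vector<int32_t> sum_rows(const std::vector<uint8_t> &a, int m, int k)
    {
        std::vector<int32_t> sums(m, 0);
        for (int row = 0; row < m; ++row)
        {
            for (int col = 0; col < k; ++col)
            {
                sums[row] += a[row * k + col];
            }
        }
        return sums;
    }

    // vector_sum_col: one S32 sum per column of a K x N matrix B (row-major).
    std::vector<int32_t> sum_cols(const std::vector<uint8_t> &b, int k, int n)
    {
        std::vector<int32_t> sums(n, 0);
        for (int row = 0; row < k; ++row)
        {
            for (int col = 0; col < n; ++col)
            {
                sums[col] += b[row * n + col];
            }
        }
        return sums;
    }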
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_a, ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_a, + ITensorInfo *vector_sum_row, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixAReductionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_a, const ITensorInfo *vector_sum_row, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -106,14 +114,18 @@ public: * - scalar Scalar value to multiply each reduced column/row by. * - mul_byscalar True if each reduced column/row must be multiplied by a scalar value. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *mtx_b, ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info) override; + void configure(const CLCompileContext &compile_context, + const ITensorInfo *mtx_b, + ITensorInfo *vector_sum_col, + const GEMMLowpReductionKernelInfo &info) override; /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmLowpMatrixBReductionKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); + static Status + validate(const ITensorInfo *mtx_b, const ITensorInfo *vector_sum_col, const GEMMLowpReductionKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp index b8997dfc7f..fd23aa9924 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" @@ -51,7 +52,13 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -59,15 +66,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 1 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.export_to_cl_image, "Export to CLImage not supported for GEMM native"); @@ -82,7 +94,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) != n); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(1) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -91,15 +103,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) != m); } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -107,9 +120,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = 
dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -117,9 +131,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; unsigned int &num_elems_processed_per_iteration_y = num_elements_processed[1]; @@ -132,17 +151,18 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(reinterpret_input_as_3d == reinterpret_output_as_3d) + if (reinterpret_input_as_3d == reinterpret_output_as_3d) { reinterpret_output_as_3d = false; } // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -155,34 +175,34 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - win_out = calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + win_out = + calculate_max_window(*dst, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowStatic src0_access(src0, 0, 0, - src0->dimension(0), - src0->dimension(1)); - AccessWindowStatic src1_access(src1, 0, 0, - ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), - src1->dimension(1)); - AccessWindowStatic dst_access(dst, 0, 0, - dst->dimension(0), - dst->dimension(1)); + AccessWindowStatic src0_access(src0, 0, 0, src0->dimension(0), src0->dimension(1)); + AccessWindowStatic src1_access( + src1, 0, 0, ceil_to_multiple(src1->dimension(0), num_elems_processed_per_iteration_x), src1->dimension(1)); + AccessWindowStatic dst_access(dst, 0, 0, dst->dimension(0), dst->dimension(1)); - if(src2 != nullptr) + if (src2 != nullptr) { 
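// [Editor's annotation, not part of the upstream diff] The bias access
// window built just below rounds the bias width up to the X processing step,
// so the vectorised stores of the last iteration stay inside allocated
// padding: with 10 bias elements and n0 == 4, ceil_to_multiple(10, 4) == 12.
// update_window_and_padding() then grows each tensor's padding to cover the
// requested access windows and reports whether anything had to change; that
// flag is what produces the "Insufficient Padding!" status further down.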
const int bias_processed_per_iteration_x = num_elems_processed_per_iteration_x; - AccessWindowStatic src2_access(src2, 0, 0, - ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), + AccessWindowStatic src2_access(src2, 0, 0, ceil_to_multiple(src2->dimension(0), bias_processed_per_iteration_x), src2->dimension(1)); - window_changed = update_window_and_padding(win, src0_access, src1_access, src2_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = update_window_and_padding(win, src0_access, src1_access, + src2_access) || // window used by the execute_window_loop + update_window_and_padding( + win_out, dst_access); // window used to update the padding requirements of dst tensor } else { - window_changed = update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop - update_window_and_padding(win_out, dst_access); // window used to update the padding requirements of dst tensor + window_changed = + update_window_and_padding(win, src0_access, src1_access) || // window used by the execute_window_loop + update_window_and_padding(win_out, + dst_access); // window used to update the padding requirements of dst tensor } // Collapse along the Z direction @@ -191,7 +211,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens const unsigned int dimension_to_collapse = std::min(static_cast<unsigned int>(dst->num_dimensions()), 2u); collapsed = win.collapse(win, dimension_to_collapse); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -201,19 +222,26 @@ ClGemmMatrixMultiplyNativeKernel::ClGemmMatrixMultiplyNativeKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, dst }); + auto padding_info = get_padding_info({src0, dst}); _reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d; _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); @@ -221,7 +249,7 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // In case both input and dst have to be 
reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if(_reinterpret_input_as_3d == _reinterpret_output_as_3d) + if (_reinterpret_input_as_3d == _reinterpret_output_as_3d) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -234,7 +262,8 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, rhs_info, gemm_info, num_elements_processed); + auto win_config = validate_and_configure_window(src0, src1, src2 != nullptr ? src2 : nullptr, dst, lhs_info, + rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -260,14 +289,17 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(_use_dummy_work_items, "-DDUMMY_WORK_ITEMS"); build_opts.add_option("-DM0=" + support::cpp11::to_string(internal_m0)); @@ -275,9 +307,13 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile build_opts.add_option("-DK0=" + support::cpp11::to_string(rhs_info.k0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + 
lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_native"); @@ -314,21 +350,23 @@ void ClGemmMatrixMultiplyNativeKernel::configure(const CLCompileContext &compile ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyNativeKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ElementsProcessed num_elements_processed{}; ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, + dst->clone().get(), lhs_info, rhs_info, gemm_info, num_elements_processed) - .first); + .first); return Status{}; } @@ -338,15 +376,18 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -358,11 +399,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window slice_matrix_b.set(Window::DimX, Window::Dimension(0, 1, 1)); slice_matrix_b.set(Window::DimY, Window::Dimension(0, 1, 1)); - if(_reinterpret_input_as_3d) + if (_reinterpret_input_as_3d) { // Pass bottom paddings to the kernel if the input has to be 
reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { idx0 = 4 * num_arguments_per_2D_tensor() + 7; } @@ -374,11 +415,11 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_uint>(idx0, static_cast<unsigned int>(total_cross_plane_pad)); } - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { // Pass bottom paddings to the kernel if the dst has to be reinterpreted as 3D tensor unsigned int idx0; - if(_add_bias) + if (_add_bias) { idx0 = 4 * num_arguments_per_2D_tensor() + 7 + (_reinterpret_input_as_3d ? 1 : 0); } @@ -395,7 +436,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -403,7 +444,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_2D_tensor_argument(idx, src0, slice); add_2D_tensor_argument(idx, src1, slice_b); - if(_add_bias) + if (_add_bias) { add_2D_tensor_argument(idx, src2, slice); } @@ -411,7 +452,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src0->info()->strides_in_bytes()[2])); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); - if(_add_bias) + if (_add_bias) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); } @@ -423,8 +464,7 @@ void ClGemmMatrixMultiplyNativeKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_int>(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h index 80f8355932..da6c9a5bb7 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYNATIVEKERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -58,7 +59,13 @@ public: * rhs_info.k0: same of lhs_info.k0 * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -68,7 +75,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const 
ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -76,14 +89,14 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp index d72d29ea1e..4fe6bddb36 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,7 +53,13 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -61,42 +68,50 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 != rhs_info.k0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.transpose == rhs_info.transpose); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && 
((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), "Only 2,3,4,8,16 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), "Mixed precision only supported for F16 data type"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((lhs_info.transpose) && ((lhs_info.m0 & (lhs_info.m0 - 1)) && lhs_info.m0 != 3), + "Only 2,3,4,8,16 are supported for m0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.transpose) && ((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision && (src0->data_type() == DataType::F32), + "Mixed precision only supported for F16 data type"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); const unsigned int m = gemm_info.m; const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape0{ src0->tensor_shape() }; + TensorShape tensor_shape0{src0->tensor_shape()}; tensor_shape0.set(0, k); tensor_shape0.set(1, m); - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -107,15 +122,18 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info0 = src0->clone()->set_tensor_shape(tensor_shape0); const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped0 = src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped0 = + src0->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(tensor_info0, lhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src0, &tensor_info_reshaped0); 
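[Editor's annotation, not part of the upstream diff] The shape handling above follows the library convention that dimension 0 is the width (columns) and dimension 1 the height (rows): for an M-by-N = (M-by-K) x (K-by-N) GEMM the LHS carries shape (k, m) and the RHS (n, k), which is why tensor_shape0/tensor_shape1 set dimension 0 to k/n and dimension 1 to m/k before the reshape calculators run. A compact sketch of that convention, restricted to two dimensions (gemm_dst_shape is a hypothetical, simplified stand-in for compute_mm_shape):

    #include <array>
    #include <cassert>

    // ACL-style 2D shape: index 0 = width (columns), index 1 = height (rows).
    using Shape = std::array<unsigned int, 2>;

    // Destination shape of an (m x k) * (k x n) GEMM, in (width, height) order.
    Shape gemm_dst_shape(const Shape &lhs, const Shape &rhs)
    {
        assert(lhs[0] == rhs[1]); // the inner dimension k must agree
        return {rhs[0], lhs[1]}; // (n, m)
    }

    int main()
    {
        const Shape lhs{128, 64}; // k = 128, m = 64
        const Shape rhs{32, 128}; // n = 32, k = 128
        const Shape dst = gemm_dst_shape(lhs, rhs);
        assert(dst[0] == 32 && dst[1] == 64);
        return 0;
    }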
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -123,9 +141,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, - const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -134,7 +157,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -147,7 +170,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -164,18 +188,26 @@ ClGemmMatrixMultiplyReshapedKernel::ClGemmMatrixMultiplyReshapedKernel() _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = 
get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; _use_dummy_work_items = preferred_dummy_work_items_support(CLKernelLibrary::get().get_device()); _add_bias = src2 != nullptr; @@ -188,14 +220,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ElementsProcessed num_elements_processed{}; // Configure kernel window - auto win_config = validate_and_configure_window(src0->clone().get(), - src1->clone().get(), - (src2 != nullptr) ? src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info, - num_elements_processed); + auto win_config = validate_and_configure_window( + src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); ICLKernel::configure_internal(win_config.second); @@ -213,12 +240,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi // Create build options CLBuildOptions build_opts; - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); - build_opts.add_option_if(_reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(dst->dimension(1))); + build_opts.add_option_if(_reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(dst->dimension(2))); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); build_opts.add_option_if(!_slide_matrix_b, "-DMATRIX_B_DEPTH=" + support::cpp11::to_string(src1->dimension(2))); build_opts.add_option_if(lhs_info.interleave, "-DLHS_INTERLEAVE"); @@ -229,7 +259,9 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option_if(rhs_info.export_to_cl_image, "-DOPENCL_IMAGE_SUPPORT"); build_opts.add_option("-DRHS_HEIGHT=" + support::cpp11::to_string(src1->dimension(1))); build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type)); - build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision ? get_cl_type_from_data_type(DataType::F32) : get_cl_type_from_data_type(data_type))); + build_opts.add_option("-DDATA_TYPE_ACCUMULATOR=" + (enable_mixed_precision + ? 
get_cl_type_from_data_type(DataType::F32) + : get_cl_type_from_data_type(data_type))); build_opts.add_option("-DM0=" + support::cpp11::to_string(lhs_info.m0)); build_opts.add_option("-DN0=" + support::cpp11::to_string(rhs_info.n0)); build_opts.add_option("-DK0=" + support::cpp11::to_string(lhs_info.k0)); @@ -237,9 +269,13 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_"); kernel_name += lhs_info.transpose ? "lhs_t_" : "lhs_nt_"; @@ -287,9 +323,15 @@ void ClGemmMatrixMultiplyReshapedKernel::configure(const CLCompileContext &compi ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; @@ -300,15 +342,18 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor 
*>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -324,12 +369,14 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -337,7 +384,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -348,7 +395,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -370,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[2])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[2])); } @@ -379,7 +426,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[2])); // Cross-plane padding (if _reinterpret_output_as_3d == true) - if(_reinterpret_output_as_3d) + if (_reinterpret_output_as_3d) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad)); } @@ -393,8 +440,7 @@ void ClGemmMatrixMultiplyReshapedKernel::run_op(ITensorPack &tensors, const Wind // Dispatch kernel enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h index 8d25412a40..30928c4e1d 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedKernel.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H #define
ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDKERNEL_H +#include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -83,16 +83,29 @@ public: * * @note lhs_info.k0 must be equal to rhs_info.k0 */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmMatrixMultiplyReshapedKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -100,14 +113,14 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp index b34c17cda8..1b19f1ec5b 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.cpp @@ -25,8 +25,9 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,24 +47,36 @@ namespace { using ElementsProcessed = Steps; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo 
&gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src0); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1 || lhs_info.m0 > 8, "Only 1,2,3,4,5,6,7,8 are supported for m0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16 || rhs_info.k0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && rhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16 || rhs_info.n0 < 2); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) - && (!gemm_info.broadcast_bias), - "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (gemm_info.reinterpret_input_as_3d || gemm_info.depth_output_gemm3d != 0) && (src2 != nullptr) && + (!gemm_info.broadcast_bias), + "Bias addition only supported with broadcast mode in case the input or dst has to be reinterpreted as 3D"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -71,19 +84,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const unsigned int n = gemm_info.n; const unsigned int k = gemm_info.k; - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src0); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -93,10 +107,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = 
src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); - if(gemm_info.reinterpret_input_as_3d) + if (gemm_info.reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -106,9 +121,10 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -116,8 +132,14 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info, ElementsProcessed &num_elements_processed) +Window validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info, + ElementsProcessed &num_elements_processed) { ARM_COMPUTE_UNUSED(src0, src1, src2); unsigned int &num_elems_processed_per_iteration_x = num_elements_processed[0]; @@ -128,14 +150,14 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. 
// This approach should only be used when the input/dst tensors have pad on the y direction - if((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) + if ((reinterpret_input_as_3d == reinterpret_output_as_3d) && gemm_info.has_pad_y) { reinterpret_output_as_3d = false; } TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -148,7 +170,8 @@ Window validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITens num_elems_processed_per_iteration_x = rhs_info.n0; num_elems_processed_per_iteration_y = lhs_info.m0; - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -164,14 +187,22 @@ ClGemmMatrixMultiplyReshapedOnlyRhsKernel::ClGemmMatrixMultiplyReshapedOnlyRhsKe _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); @@ -182,11 +213,11 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext _export_to_cl_image = rhs_info.export_to_cl_image; _has_pad_y = gemm_info.has_pad_y; - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); // In case both input and dst have to be reinterpreted as 3D tensors, // force reinterpret_input_as_3d and reinterpret_output_as_3d to be false. - if((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) + if ((_reinterpret_input_as_3d == _reinterpret_output_as_3d) && _has_pad_y) { _reinterpret_input_as_3d = false; _reinterpret_output_as_3d = false; @@ -199,8 +230,9 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ElementsProcessed num_elements_processed{}; // Configure kernel window - Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), (src2 != nullptr) ? 
src2->clone().get() : nullptr, dst->clone().get(), lhs_info, rhs_info, gemm_info, - num_elements_processed); + Window win = validate_and_configure_window(src0->clone().get(), src1->clone().get(), + (src2 != nullptr) ? src2->clone().get() : nullptr, dst->clone().get(), + lhs_info, rhs_info, gemm_info, num_elements_processed); ICLKernel::configure_internal(win); // If _reinterpret_input_as_3d = reinterpret_output_as_3d = true, @@ -225,7 +257,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -240,17 +273,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext build_opts.add_option("-DH0=" + support::cpp11::to_string(rhs_info.h0)); build_opts.add_option("-DPARTIAL_STORE_M0=" + support::cpp11::to_string(partial_store_m0)); build_opts.add_option("-DPARTIAL_STORE_N0=" + support::cpp11::to_string(partial_store_n0)); - if(_has_pad_y) + if (_has_pad_y) { build_opts.add_option_if(_reinterpret_input_as_3d, "-DREINTERPRET_INPUT_AS_3D"); build_opts.add_option_if(_reinterpret_output_as_3d, "-DREINTERPRET_OUTPUT_AS_3D"); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); - build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DHEIGHT_GEMM3D=" + support::cpp11::to_string(h_gemm_3d)); + build_opts.add_option_if(_reinterpret_input_as_3d || _reinterpret_output_as_3d, + "-DDEPTH_GEMM3D=" + support::cpp11::to_string(d_gemm_3d)); } - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); - build_opts.add_option_if(gemm_info.activation_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); + build_opts.add_option_if(gemm_info.activation_info.enabled(), + "-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); std::string kernel_name("gemm_mm_reshaped_only_rhs_"); kernel_name += rhs_info.transpose ? 
"t" : "nt"; @@ -294,28 +333,39 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure(const CLCompileContext ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -341,12 +391,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } do @@ -354,7 +406,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con Window slice_b = slice; // Don't slice matrix B along the z dimension if matrix B has just 2 dimensions and 
matrix A more than 2 // This scenario can happen when the matrix multiplication is used to perform a convolution operation - if(!_slide_matrix_b) + if (!_slide_matrix_b) { slice_b = slice_matrix_b; } @@ -365,7 +417,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con add_2D_tensor_argument(idx, src0, slice); // RHS buffer or RHS OpenCL image (_export_to_cl_image == true) - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } @@ -387,22 +439,23 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src1->info()->strides_in_bytes()[rhs_idx_batch_size])); // Bias stride_z (if _add_bias == true) - if(_add_bias) + if (_add_bias) { - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size])); + _kernel.setArg<cl_uint>(idx++, + static_cast<unsigned int>(src2->info()->strides_in_bytes()[bia_idx_batch_size])); } // dst stride_z _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[out_idx_batch_size])); // Cross-plane padding (if _reinterpret_input_as_3d = true) - if(_reinterpret_input_as_3d && _has_pad_y) + if (_reinterpret_input_as_3d && _has_pad_y) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_lhs)); } // Cross-plane padding (if reinterpret_output_as_3d = true) - if(_reinterpret_output_as_3d && _has_pad_y) + if (_reinterpret_output_as_3d && _has_pad_y) { _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(total_cross_plane_pad_out)); } @@ -413,8 +466,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsKernel::run_op(ITensorPack &tensors, con _kernel.setArg<cl_int>(idx++, _k); enqueue(queue, *this, slice, lws_hint(), _use_dummy_work_items); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h index 471160c94b..e8fd78d476 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsKernel.h @@ -24,12 +24,12 @@ #ifndef ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H #define ACL_SRC_GPU_CL_KERNELS_CLGEMMMATRIXMULTIPLYRESHAPEDONLYRHSKERNEL_H +#include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" -#include "arm_compute/core/KernelDescriptors.h" - namespace arm_compute { namespace opencl @@ -74,32 +74,46 @@ public: * rhs_info.transpose: true,false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, - const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * *
Similar to @ref ClGemmMatrixMultiplyReshapedOnlyRhsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, - const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _slide_matrix_b{ true }; - bool _reinterpret_input_as_3d{ false }; - bool _reinterpret_output_as_3d{ false }; - bool _use_dummy_work_items{ false }; - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - bool _has_pad_y{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _slide_matrix_b{true}; + bool _reinterpret_input_as_3d{false}; + bool _reinterpret_output_as_3d{false}; + bool _use_dummy_work_items{false}; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + bool _has_pad_y{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp index 734f8f9b4c..9a2a4890f3 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.cpp @@ -23,16 +23,17 @@ */ #include "src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLUtils.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -56,23 +57,36 @@ constexpr int mmul_m0 = 4; constexpr int mmul_n0 = 4; constexpr int mmul_k0 = 4; -Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, +Status validate_arguments(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src0, src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target 
platform"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src0, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, src1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, "The number of dimensions for the LHS matrix must be <= 4"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, "The number of dimensions for the RHS matrix must be <= 3"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src0->num_dimensions() > 4, + "The number of dimensions for the LHS matrix must be <= 4"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(src1->num_dimensions() > 3, + "The number of dimensions for the RHS matrix must be <= 3"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_info.m0 < 1, "Only values greater than 0 are supported for m0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && rhs_info.n0 != 8 && rhs_info.n0 != 16, "Only 1,2,3,4,8, and 16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.n0 != 1 && rhs_info.n0 != 2 && rhs_info.n0 != 3 && rhs_info.n0 != 4 && + rhs_info.n0 != 8 && rhs_info.n0 != 16, + "Only 1,2,3,4,8, and 16 are supported for n0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.k0 != 1 || lhs_info.k0 != 1), "Only 1 is supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON_MSG((rhs_info.h0 != 4), "Only 4 is supported for h0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, "Only true is supported for interleave with mmul extension enabled"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, "Only false is supported for transpose with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.interleave != true, + "Only true is supported for interleave with mmul extension enabled"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_info.transpose != false, + "Only false is supported for transpose with mmul extension enabled"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.fp_mixed_precision, "Mixed precision not supported"); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(*src1, rhs_info)); @@ -87,7 +101,7 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(0) != k); // Validate the reinterpreted-as-3D-case - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(1) * src0->dimension(2) != m); } @@ -97,9 +111,9 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } // Validate the gemm-batched case - if(src1->num_dimensions() > 2) + if (src1->num_dimensions() > 2) { - if(gemm_info.depth_output_gemm3d != 0) + if (gemm_info.depth_output_gemm3d != 0) { ARM_COMPUTE_RETURN_ERROR_ON(src0->dimension(3) != src1->dimension(2)); } @@ -109,15 +123,16 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - if(src2 != nullptr && !(helpers::float_ops::is_zero(beta))) + if (src2 != nullptr && !(helpers::float_ops::is_zero(beta))) { const unsigned int src2_dim0 = src2->dimension(0); const unsigned int src2_dim1 = src2->dimension(1); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src2, src1); - if(gemm_info.broadcast_bias) + if (gemm_info.broadcast_bias) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), "Incorrect dimension of bias matrix which is to be broadcasted"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src2_dim1 != 1 || src2_dim0 != n), + "Incorrect dimension of bias matrix which is to be broadcasted"); } else { @@ -125,18 
+140,20 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons } } - TensorShape tensor_shape1{ src1->tensor_shape() }; + TensorShape tensor_shape1{src1->tensor_shape()}; tensor_shape1.set(0, n); tensor_shape1.set(1, k); - const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); - const TensorInfo tensor_info_reshaped1 = src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); + const TensorInfo tensor_info1 = src1->clone()->set_tensor_shape(tensor_shape1); + const TensorInfo tensor_info_reshaped1 = + src1->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(tensor_info1, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src1, &tensor_info_reshaped1); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); + const TensorInfo tensor_info_dst = + dst->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src0, dst); } @@ -144,7 +161,11 @@ Status validate_arguments(const ITensorInfo *src0, const ITensorInfo *src1, cons return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) { @@ -152,11 +173,12 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src0, ITens bool reinterpret_output_as_3d = gemm_info.depth_output_gemm3d != 0; // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); TensorInfo tmp_info(*dst); - if(reinterpret_output_as_3d) + if (reinterpret_output_as_3d) { // Since the dst tensor has to be reinterpreted as 3D and the execute window is based on a 2D GEMM, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -204,19 +226,26 @@ ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::ClGemmMatrixMultiplyReshapedOnlyR _type = CLKernelType::GEMM; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); + auto_init_if_empty( + *dst, 
src0->clone()->set_tensor_shape(misc::shape_calculator::compute_mm_shape(*src0, *src1, gemm_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - auto padding_info = get_padding_info({ src0, src1, src2, dst }); + auto padding_info = get_padding_info({src0, src1, src2, dst}); _add_bias = src2 != nullptr; _export_to_cl_image = rhs_info.export_to_cl_image; @@ -236,7 +265,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon // Create build options CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src0->data_type())); - build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), "-DALPHA=" + float_to_string_with_full_precision(alpha)); + build_opts.add_option_if(!(helpers::float_ops::is_one(alpha)), + "-DALPHA=" + float_to_string_with_full_precision(alpha)); build_opts.add_option_if(src2 != nullptr, "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(helpers::float_ops::is_one(beta), "-DUNIT_BETA"); build_opts.add_option_if(gemm_info.broadcast_bias, "-DBROADCAST_BIAS"); @@ -249,7 +279,8 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon build_opts.add_option("-DMMUL_M0=" + support::cpp11::to_string(mmul_m0)); build_opts.add_option("-DMMUL_N0=" + support::cpp11::to_string(mmul_n0)); build_opts.add_option("-DMMUL_K0=" + support::cpp11::to_string(mmul_k0)); - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(gemm_info.activation_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(gemm_info.activation_info.b())); @@ -283,37 +314,44 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::configure(const CLCompileCon ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, +Status ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, - const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info) + const GEMMRHSMatrixInfo &rhs_info, + const GEMMKernelInfo &gemm_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src0, src1, src2, dst, alpha, beta, lhs_info, rhs_info, gemm_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), - src1->clone().get(), + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src0->clone().get(), src1->clone().get(), src2 != nullptr ? 
src2->clone().get() : nullptr, - dst->clone().get(), - lhs_info, - rhs_info, - gemm_info) - .first); + dst->clone().get(), lhs_info, rhs_info, gemm_info) + .first); return Status{}; } -void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src0, src1, dst); ARM_COMPUTE_ERROR_ON(_add_bias && src2 == nullptr); - if(src1->info()->num_dimensions() < 3) + if (src1->info()->num_dimensions() < 3) { // The stride_z for matrix B must be zero if we do not slice ARM_COMPUTE_ERROR_ON(src1->info()->strides_in_bytes()[3] != 0); @@ -321,12 +359,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, cl::Image2D src1_image2d; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const TensorShape shape2d(src1->info()->dimension(0) / 4, src1->info()->dimension(1) * src1->info()->dimension(2)); + const TensorShape shape2d(src1->info()->dimension(0) / 4, + src1->info()->dimension(1) * src1->info()->dimension(2)); const size_t image_row_pitch = src1->info()->strides_in_bytes()[1]; - src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + src1_image2d = create_image2d_from_buffer(CLKernelLibrary::get().context(), src1->cl_buffer(), shape2d, + src1->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } Window slice = window.first_slice_window_3D(); @@ -336,14 +376,14 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, src0); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, src1_image2d); } add_3d_tensor_nhw_argument(idx, src1); // Bias buffer (_add_bias == true) - if(_add_bias) + if (_add_bias) { add_3d_tensor_nhw_argument(idx, src2); } @@ -358,8 +398,7 @@ void ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::run_op(ITensorPack &tensors, // LWS_x should be multiple of 16 at least. 
(32, 2) has been chosen to have more work-items on a single core // LWS also enforces the order of execution of the workitems which improves cache utilization enqueue(queue, *this, slice, cl::NDRange(32, 2), false); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h index 59612fcf5d..86d3012f6e 100644 --- a/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h +++ b/src/gpu/cl/kernels/ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_GEMM_MATRIXMULTIPLY_RESHAPED_ONLY_RHS_MMUL_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,13 @@ public: * rhs_info.transpose: false * @param[in] gemm_info GEMM information used to retrieve the original dimensions of the input matrices */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src0, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float alpha, float beta, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src0, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float alpha, + float beta, const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -69,7 +76,13 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float alpha, float beta, const GEMMLHSMatrixInfo &lhs_info, + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float alpha, + float beta, + const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const GEMMKernelInfo &gemm_info); @@ -77,11 +90,11 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _add_bias{ false }; - bool _export_to_cl_image{ false }; - signed int _m{ 1 }; - signed int _n{ 1 }; - signed int _k{ 1 }; + bool _add_bias{false}; + bool _export_to_cl_image{false}; + signed int _m{1}; + signed int _n{1}; + signed int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp index bf4b664b6e..eea2a169a3 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,13 +47,17 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.v0 == 
0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), "Only 2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((lhs_info.k0 & (lhs_info.k0 - 1)) && lhs_info.k0 != 3), + "Only 2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(lhs_info.m0 < 2 || lhs_info.m0 > 8); ARM_COMPUTE_RETURN_ERROR_ON((lhs_info.m0 > 4 && lhs_info.m0 < 8) && lhs_info.transpose); @@ -60,10 +65,11 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -71,14 +77,15 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Window +configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) { const unsigned int num_elems_processed_per_iteration_x = lhs_info.k0; const unsigned int num_elems_processed_per_iteration_y = lhs_info.m0; TensorInfo tmp_info(*src); - if(reinterpret_input_as_3d) + if (reinterpret_input_as_3d) { // Since the src tensor has to be reinterpreted as 3D and the execute window is based on a 2D interleave, // the window needs to be constructed on the 2D collapsed version of the tensor @@ -88,10 +95,12 @@ Window configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixI } // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape(*src, lhs_info, reinterpret_input_as_3d))); + auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_lhs_reshaped_shape( + *src, lhs_info, reinterpret_input_as_3d))); // Configure window - Window win = calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(tmp_info, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); // Collapse along the Z direction // This collapse needs to be here in order to tune the Z dimension of LWS @@ -106,14 +115,18 @@ ClGemmReshapeLhsMatrixKernel::ClGemmReshapeLhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Perform validate step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); - auto padding_info =
get_padding_info({ src }); + auto padding_info = get_padding_info({src}); const unsigned int src_w = src->dimension(0); const unsigned int m = reinterpret_input_as_3d ? src->dimension(1) * src->dimension(2) : src->dimension(1); @@ -168,7 +181,10 @@ void ClGemmReshapeLhsMatrixKernel::configure(const CLCompileContext &compile_con ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_input_as_3d) +Status ClGemmReshapeLhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_input_as_3d) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, lhs_info, reinterpret_input_as_3d)); return Status{}; @@ -179,8 +195,9 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -192,8 +209,7 @@ void ClGemmReshapeLhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h index db88e0d735..8e84e8ad8e 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeLhsMatrixKernel.h @@ -57,14 +57,21 @@ public: * lhs_info.interleave: true, false * @param[in] reinterpret_src_as_3d (Optional) True if the src has to be reinterpreted as 3D tensor */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d = false); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeLhsMatrixKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMLHSMatrixInfo &lhs_info, bool reinterpret_src_as_3d); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMLHSMatrixInfo &lhs_info, + bool reinterpret_src_as_3d); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -72,4 +79,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */
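The k0 check in the LHS reshape validator above leans on a bit trick: x & (x - 1) clears the lowest set bit, so the expression is zero exactly when x is a power of two, with 3 special-cased on top. A minimal standalone sketch of the documented {2,3,4,8,16} contract from the error message (is_supported_k0 is an illustrative name, not ACL API):

#include <cassert>
#include <initializer_list>

// Power-of-two test via x & (x - 1), with 3 special-cased, mirroring
// ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && k0 != 3), ...)
// combined with the documented bounds, so the accepted set is {2, 3, 4, 8, 16}.
static bool is_supported_k0(unsigned int k0)
{
    const bool pow2_or_3 = ((k0 & (k0 - 1)) == 0) || (k0 == 3);
    return pow2_or_3 && k0 >= 2 && k0 <= 16;
}

int main()
{
    for (unsigned int k0 : {2u, 3u, 4u, 8u, 16u})
    {
        assert(is_supported_k0(k0));
    }
    for (unsigned int k0 : {5u, 6u, 7u, 9u, 32u})
    {
        assert(!is_supported_k0(k0));
    }
    return 0;
}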
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_LHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp index b3a03880ed..b9ce3873c7 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.cpp @@ -31,6 +31,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -52,8 +53,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 == 0); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.h0 == 0); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), "Only 2,3,4,8,16 are supported for n0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), "Only 1,2,3,4,8,16 are supported for k0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 & (rhs_info.n0 - 1)) && rhs_info.n0 != 3), + "Only 2,3,4,8,16 are supported for n0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 & (rhs_info.k0 - 1)) && (rhs_info.k0 != 1) && (rhs_info.k0 != 3)), + "Only 1,2,3,4,8,16 are supported for k0"); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.n0 > 16); ARM_COMPUTE_RETURN_ERROR_ON(rhs_info.k0 > 16); ARM_COMPUTE_RETURN_ERROR_ON((rhs_info.k0 == 1) && (rhs_info.transpose)); @@ -61,15 +64,17 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, src->data_type()); + const TensorInfo tensor_reshaped_info(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info), 1, + src->data_type()); ARM_COMPUTE_RETURN_ON_ERROR(gemm::validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info)); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); } @@ -77,23 +82,27 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) { const unsigned int num_elems_processed_per_iteration_x = rhs_info.n0; const unsigned int num_elems_processed_per_iteration_y = rhs_info.k0; bool window_changed = false; // dst auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); + auto_init_if_empty( + *dst, 
src->clone()->set_tensor_shape(misc::shape_calculator::compute_rhs_reshaped_shape(*src, rhs_info))); // Configure window - Window win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); + Window win = + calculate_max_window(*src, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y)); - AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y); + AccessWindowRectangle src_access(src, 0, 0, num_elems_processed_per_iteration_x, + num_elems_processed_per_iteration_y); window_changed = update_window_and_padding(win, src_access); - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { gemm::update_padding_for_cl_image(dst); } @@ -102,7 +111,8 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso // This collapse needs to be here in order to tune the Z dimension of LWS Window collapsed = win.collapse(win, Window::DimZ); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, collapsed); } } // namespace @@ -112,7 +122,10 @@ ClGemmReshapeRhsMatrixKernel::ClGemmReshapeRhsMatrixKernel() _type = CLKernelType::ELEMENTWISE; } -void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -143,7 +156,9 @@ void ClGemmReshapeRhsMatrixKernel::configure(const CLCompileContext &compile_con _kernel.setArg<cl_int>(idx++, rhs_info.h0); } -Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info) +Status ClGemmReshapeRhsMatrixKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, rhs_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), rhs_info).first); @@ -156,8 +171,9 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); @@ -169,9 +185,8 @@ void ClGemmReshapeRhsMatrixKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, src); add_3d_tensor_nhw_argument(idx, dst); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice)); + } while (window.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
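When rhs_info.export_to_cl_image is set, the kernels above rebind the reshaped RHS buffer as a read-only 2D OpenCL image: one RGBA texel packs four tensor elements, which is why the shape2d width is dimension(0) / 4 while dimension(1) and the batch dimension(2) are folded into rows, and the row pitch is taken straight from the tensor's Y stride. A sketch of that mapping under those assumptions (Image2dDims and rhs_image_dims are illustrative helpers, not ACL API):

#include <array>
#include <cstddef>

// Derive 2D image dimensions for a buffer exported as a CL image,
// mirroring the shape2d/image_row_pitch computation in run_op above.
struct Image2dDims
{
    std::size_t width;     // texels per row: elements / 4 (one RGBA texel = 4 elements)
    std::size_t height;    // rows: dimension(1) folded with the batch dimension(2)
    std::size_t row_pitch; // bytes per row, taken from the tensor's Y stride
};

Image2dDims rhs_image_dims(const std::array<std::size_t, 3> &dims, std::size_t stride_y_bytes)
{
    return Image2dDims{dims[0] / 4, dims[1] * dims[2], stride_y_bytes};
}

For example, a reshaped F32 RHS of 64x32x2 with a 256-byte Y stride maps to a 16x64 image with a 256-byte row pitch.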
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h index 31eaa46e02..7203d574fb 100644 --- a/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h +++ b/src/gpu/cl/kernels/ClGemmReshapeRhsMatrixKernel.h @@ -66,7 +66,10 @@ public: * rhs_info.transpose: true, false * rhs_info.interleave: true, false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const GEMMRHSMatrixInfo &rhs_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const GEMMRHSMatrixInfo &rhs_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClGemmReshapeRhsMatrixKernel::configure() @@ -81,4 +84,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */
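Every run_op in these kernels dispatches with the same shape: take the first 3D slice of the execution window, bind the tensor arguments for that slice, enqueue, then slide to the next Z slice until the window is exhausted. Stripped of the ACL types, the control flow reduces to the sketch below (Slice3D and enqueue_slice stand in for Window and the add_*_tensor_argument/enqueue calls):

#include <cstdio>

// One Z plane of the collapsed execution window.
struct Slice3D
{
    int z;
};

// Stand-in for setting per-slice kernel arguments and enqueueing the NDRange.
static void enqueue_slice(const Slice3D &slice)
{
    std::printf("enqueue slice z=%d\n", slice.z);
}

int main()
{
    const int depth = 4; // extent of the collapsed Z dimension
    Slice3D slice{0};
    do
    {
        enqueue_slice(slice);        // mirrors enqueue(queue, *this, slice, ...)
    } while (++slice.z < depth);     // slide_window_slice_3D returns false past the end
    return 0;
}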
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_GEMM_RESHAPE_RHS_MATRIX_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp index 719201d1fe..2e1cefc6e7 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(Window::DimY) + height_offset > dst->dimension(Window::DimY)); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) != dst->dimension(0)); - for(size_t i = 2; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 2; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -62,8 +62,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int height_offset, co } } // namespace -ClHeightConcatenateKernel::ClHeightConcatenateKernel() - : _height_offset(0) +ClHeightConcatenateKernel::ClHeightConcatenateKernel() : _height_offset(0) { _type = CLKernelType::ELEMENTWISE; } @@ -74,12 +73,15 @@ Status ClHeightConcatenateKernel::validate(const ITensorInfo *src, unsigned int return Status{}; } -void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst) +void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int height_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, height_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _height_offset = height_offset; @@ -90,9 +92,10 @@ void ClHeightConcatenateKernel::configure(const CLCompileContext &compile_contex build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DHEIGHT_OFFSET=" + support::cpp11::to_string(_height_offset)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -125,8 +128,9 @@ void ClHeightConcatenateKernel::run_op(ITensorPack &tensors, const Window &windo ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = 
utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h index d3c077fc22..5a391a1212 100644 --- a/src/gpu/cl/kernels/ClHeightConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClHeightConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int height_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClHeightConcatenateKernel::configure() @@ -64,7 +65,7 @@ public: private: unsigned int _height_offset; - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.cpp b/src/gpu/cl/kernels/ClIm2ColKernel.cpp index e890847199..ef7a52828f 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.cpp +++ b/src/gpu/cl/kernels/ClIm2ColKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -60,13 +61,19 @@ struct Im2ColConfiguration bool is_padding_required_nchw{}; }; -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const unsigned int channel_idx = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && has_bias); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); ARM_COMPUTE_RETURN_ERROR_ON((dilation.x() < 1) || (dilation.y() < 1)); @@ -82,9 +89,10 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const const unsigned total_height = src->dimension(height_idx) + conv_info.pad_top() + conv_info.pad_bottom(); ARM_COMPUTE_RETURN_ERROR_ON((total_width < kernel_dims.width) || (total_height < kernel_dims.height)); - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(compute_im2col_conv_shape(src, 
kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); + const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape( + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -93,13 +101,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_elems_processed_per_iteration, bool is_padding_required_nchw, unsigned int num_groups) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_elems_processed_per_iteration, + bool is_padding_required_nchw, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv_info, has_bias, dilation, num_groups == 1, num_groups); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(expected_output_shape)); @@ -113,22 +129,22 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso bool window_changed = false; Window win; - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { win = calculate_max_window(*src, Steps(num_elems_processed_per_iteration)); } else { - if(is_padding_required_nchw) + if (is_padding_required_nchw) { - const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), conv_info.pad_left()); - win = calculate_max_window(*src, - Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); - AccessWindowStatic input_access(src, - -border.left, - -border.top, - ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), - input_height + border.bottom); + const BorderSize border(conv_info.pad_top(), conv_info.pad_right(), conv_info.pad_bottom(), + conv_info.pad_left()); + win = calculate_max_window( + *src, Steps(num_elems_processed_per_iteration * conv_info.stride().first, conv_info.stride().second)); + AccessWindowStatic input_access( + src, -border.left, -border.top, + ceil_to_multiple(input_width + border.right, kernel_dims.width * num_elems_processed_per_iteration), + input_height + border.bottom); window_changed = window_changed || update_window_and_padding(win, input_access); } else @@ -142,11 +158,17 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *src, ITenso // set the Z dimension's step same size as the whole dimension so that one can't split across the Z dimension win.set_dimension_step(Window::DimZ, win[Window::DimZ].end() - win[Window::DimZ].start()); - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? 
ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } -Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, unsigned int num_groups) +Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -157,7 +179,8 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D const unsigned int input_height = src->dimension(height_idx); const unsigned int input_channel = src->dimension(channel_idx); - const std::pair<unsigned int, unsigned int> convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + const std::pair<unsigned int, unsigned int> convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); // Im2Col configuration std::string kernel_name = "im2col_generic_"; @@ -184,21 +207,22 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D build_opts.add_option("-DDILATION_X=" + support::cpp11::to_string(dilation.x())); build_opts.add_option("-DDILATION_Y=" + support::cpp11::to_string(dilation.y())); build_opts.add_option_if(num_groups > 1, "-DNUM_GROUPS=" + support::cpp11::to_string(num_groups)); - build_opts.add_option_if_else(is_data_type_quantized(data_type), "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); + build_opts.add_option_if_else(is_data_type_quantized(data_type), + "-DPAD_VALUE=" + support::cpp11::to_string(qinfo.offset), "-DPAD_VALUE=0"); build_opts.add_option_if(has_bias, "-DHAS_BIAS"); - if(data_layout == DataLayout::NHWC) + if (data_layout == DataLayout::NHWC) { num_elems_processed_per_iteration = std::min(2U, input_channel); is_padding_required_nchw = false; // Only the 3x3 and 9x9 cases are optimized for NHWC - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { kernel_name = "im2col3x3_"; build_opts.add_option("-DIM2COL_3X3"); } - else if(kernel_dims == Size2D(9U, 9U)) + else if (kernel_dims == Size2D(9U, 9U)) { kernel_name = "im2col9x9_"; build_opts.add_option("-DIM2COL_9X9"); @@ -219,17 +243,17 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } else { - if(dilation == Size2D(1U, 1U)) + if (dilation == Size2D(1U, 1U)) { const bool squared_im2col = kernel_dims.width == kernel_dims.height; - if(squared_im2col) + if (squared_im2col) { // Check if we can run an optimized im2col for NCHW - switch(kernel_dims.width) + switch (kernel_dims.width) { case 1: // Optimized im2col1x1 if stride_x = 1 and conv_info.has_padding() = false - if(conv_info.stride().first == 1 && !conv_info.has_padding()) + if (conv_info.stride().first == 1 && !conv_info.has_padding()) { kernel_name = "im2col1x1_stridex1_"; num_elems_processed_per_iteration = 4; @@ -248,7 +272,7 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D break; case 11: // Optimized im2col11x11 if pad_x = pad_y = 0 - if(!conv_info.has_padding()) + if (!conv_info.has_padding()) { kernel_name = "im2col11x11_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -262,7 +286,7 @@ Im2ColConfiguration 
configure_opencl_kernel(const ITensorInfo *src, const Size2D break; } } - else if(kernel_dims.width > 1 && !conv_info.has_padding()) + else if (kernel_dims.width > 1 && !conv_info.has_padding()) { kernel_name = "im2col_generic_padx0_pady0_"; num_elems_processed_per_iteration = 1; @@ -297,19 +321,29 @@ Im2ColConfiguration configure_opencl_kernel(const ITensorInfo *src, const Size2D } // namespace ClIm2ColKernel::ClIm2ColKernel() - : _data_layout(DataLayout::UNKNOWN), _convolved_dims(), _num_elems_processed_per_iteration(1), _kernel_dims(), _conv_info(), _num_groups() + : _data_layout(DataLayout::UNKNOWN), + _convolved_dims(), + _num_elems_processed_per_iteration(1), + _kernel_dims(), + _conv_info(), + _num_groups() { _type = CLKernelType::ELEMENTWISE; } -void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation, - unsigned int num_groups) +void ClIm2ColKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); _data_layout = src->data_layout(); const unsigned int width_idx = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); @@ -320,19 +354,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI // Select and configure the optimal OpenCL kernel to run. 
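// The _convolved_dims pair computed below via scaled_dimensions() follows the
// standard convolution output arithmetic; assuming floor rounding it is
//   out_w = (in_w + pad_left + pad_right - dilation_x * (k_w - 1) - 1) / stride_x + 1
// (and likewise for the height). For example, a 224x224 input with a 3x3
// kernel, stride 2, pad 1 and no dilation gives (224 + 2 - 2 - 1) / 2 + 1 = 112,
// i.e. a 112x112 im2col output grid.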
// This function returns the OpenCL kernel's name, the arguments to pass at compile time, the number of elements processed per iteration // and the padding requirement flag - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); // Create kernel _kernel = create_kernel(compile_context, im2col_config.kernel_name, im2col_config.build_options); - _convolved_dims = scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); + _convolved_dims = + scaled_dimensions(input_width, input_height, kernel_dims.width, kernel_dims.height, conv_info, dilation); _num_elems_processed_per_iteration = im2col_config.num_elems_processed_per_iteration; _kernel_dims = kernel_dims; // Only needed by the Tuner _conv_info = conv_info; // Only needed by the Tuner _num_groups = num_groups; // Configure kernel window - auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + auto win_config = validate_and_configure_window(src, dst, kernel_dims, conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -353,14 +390,22 @@ void ClIm2ColKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(src->data_layout() == DataLayout::NHWC && has_padding_changed(padding_info)); } -Status ClIm2ColKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation, - unsigned int num_groups) +Status ClIm2ColKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, kernel_dims, conv_info, has_bias, dilation, num_groups)); - Im2ColConfiguration im2col_config = configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, conv_info, has_bias, dilation, im2col_config.num_elems_processed_per_iteration, + Im2ColConfiguration im2col_config = + configure_opencl_kernel(src, kernel_dims, conv_info, has_bias, dilation, num_groups); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), kernel_dims, + conv_info, has_bias, dilation, + im2col_config.num_elems_processed_per_iteration, im2col_config.is_padding_required_nchw, num_groups) - .first); + .first); return Status{}; } @@ -388,7 +433,7 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice_in = first_slice_3d; Window slice_out = window_output.first_slice_window_2D(); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { const Window tmp_win = window.collapse_if_possible(ICLKernel::window(), 3); const int num_batches = tmp_win[3].end(); @@ -398,7 +443,10 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm } else { - slice.set(0, Window::Dimension(0, 
static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), _num_elems_processed_per_iteration)); + slice.set(0, + Window::Dimension( + 0, static_cast<int>(ceil_to_multiple(_convolved_dims.first, _num_elems_processed_per_iteration)), + _num_elems_processed_per_iteration)); slice.set(1, Window::Dimension(0, static_cast<int>(_convolved_dims.second), 1)); // Note: In case of NCHW the 3rd dimension is already set collapsing the input window } @@ -414,14 +462,16 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - unsigned int idx = num_arguments_per_3D_tensor() + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); + unsigned int idx = num_arguments_per_3D_tensor() + + (_num_groups == 1 ? num_arguments_per_2D_tensor() : num_arguments_per_3D_tensor()); _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(src->info()->strides_in_bytes()[3])); - _kernel.setArg<cl_uint>(idx++, static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); + _kernel.setArg<cl_uint>(idx++, + static_cast<unsigned int>(dst->info()->strides_in_bytes()[((_num_groups == 1) ? 2 : 3)])); do { unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice_in); - if(_num_groups == 1) + if (_num_groups == 1) { add_2D_tensor_argument(idx, dst, slice_out); } @@ -430,8 +480,8 @@ void ClIm2ColKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm add_3D_tensor_argument(idx, dst, slice_out); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && window_collapsed.slide_window_slice_3D(slice_in)); + } while (window_collapsed.slide_window_slice_3D(slice) && window_output.slide_window_slice_2D(slice_out) && + window_collapsed.slide_window_slice_3D(slice_in)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIm2ColKernel.h b/src/gpu/cl/kernels/ClIm2ColKernel.h index a637ad215d..c8cd5b328d 100644 --- a/src/gpu/cl/kernels/ClIm2ColKernel.h +++ b/src/gpu/cl/kernels/ClIm2ColKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/Size2D.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -77,28 +78,38 @@ public: * @param[in] dilation (Optional) Dilation, in elements, across x and y. Defaults to (1, 1). * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. 
num_groups != 1 is only supported for NCHW data layout */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, - const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIm2ColKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *output, const Size2D &kernel_dims, const PadStrideInfo &conv_info, bool has_bias, const Size2D &dilation = Size2D(1U, 1U), - unsigned int num_groups = 1); + static Status validate(const ITensorInfo *input, + const ITensorInfo *output, + const Size2D &kernel_dims, + const PadStrideInfo &conv_info, + bool has_bias, + const Size2D &dilation = Size2D(1U, 1U), + unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - DataLayout _data_layout; + DataLayout _data_layout; std::pair<unsigned int, unsigned int> _convolved_dims; - unsigned int _num_elems_processed_per_iteration; - Size2D _kernel_dims; - PadStrideInfo _conv_info; - unsigned int _num_groups; + unsigned int _num_elems_processed_per_iteration; + Size2D _kernel_dims; + PadStrideInfo _conv_info; + unsigned int _num_groups; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp index d291fad76c..8c493d08c6 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,26 +44,29 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(0) != src->dimension(0), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or 
equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::S32); } @@ -75,8 +79,12 @@ ClIndirectConv2dAddressPrecalculationKernel::ClIndirectConv2dAddressPrecalculati _type = CLKernelType::ELEMENTWISE; } -void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, dst, conv_info, desc)); @@ -85,11 +93,8 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte constexpr unsigned int height_idx = 2; // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape output_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorShape output_conv_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); @@ -136,14 +141,19 @@ void ClIndirectConv2dAddressPrecalculationKernel::configure(const CLCompileConte // Since this kernel should be called only once, we do not need to set the config_id for tuning } -Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dAddressPrecalculationKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, dst, conv_info, desc)); return Status{}; } -void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) +void ClIndirectConv2dAddressPrecalculationKernel::run_op(ITensorPack &tensors, + const Window &window, + cl::CommandQueue &queue) { ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(IKernel::window(), window); diff --git a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h index ff7f4be147..b565609c6a 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h +++ 
b/src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h @@ -60,16 +60,23 @@ public: * @param[in] conv_info Contains padding and stride information described in @ref PadStrideInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC direct/indirect convolution kernel. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dAddressPrecalculationKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp index a337eb50fd..3510b6970c 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.cpp @@ -23,13 +23,14 @@ */ #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/KernelDescriptors.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLUtils.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -46,8 +47,14 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); @@ -55,37 +62,38 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(indirect_buffer, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(src, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(indirect_buffer->tensor_shape(), - misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + 
indirect_buffer->tensor_shape(), + misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), src->data_layout(), + weights->tensor_shape(), conv_info, desc)); constexpr int channel_idx = 0; constexpr int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != src->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, "M0 can only be greater than 0 and less than or equal to 8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.m0 <= 0 || desc.m0 > 8, + "M0 can only be greater than 0 and less than or equal to 8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && desc.n0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.n0 != 1 && desc.n0 != 2 && desc.n0 != 3 && desc.n0 != 4 && desc.n0 != 8 && + desc.n0 != 16, "N0 can only be: 1, 2, 3, 4, 8, and 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 1 && desc.k0 != 2 && desc.k0 != 3 && desc.k0 != 4 && desc.k0 != 8 && + desc.k0 != 16, "K0 can only be: 1, 2, 3, 4, 8, and 16"); - if(desc.export_weights_to_cl_image) + if (desc.export_weights_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, - "K0 can only be: 4, 8, and 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(desc.k0 != 4 && desc.k0 != 8 && desc.k0 != 16, "K0 can only be: 4, 8, and 16"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(weights), "Export to CLImage is not supported for this weight configuration"); } - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -95,15 +103,14 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co } ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->dimension(channel_idx) != weights->dimension(batch_idx), "Biases size and number of dst feature maps should match"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, - "Biases should be one dimensional"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(biases->num_dimensions() > 1, "Biases should be one dimensional"); } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), - misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + dst->tensor_shape(), misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); } @@ -116,13 +123,21 @@ ClIndirectConv2dKernel::ClIndirectConv2dKernel() _type = CLKernelType::DIRECT; } -void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *indirect_buffer, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, 
const DirectConvComputeKernelInfo &desc) +void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *indirect_buffer, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, indirect_buffer, dst); // Perform validation - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); constexpr unsigned int channel_idx = 0; constexpr unsigned int width_idx = 1; @@ -137,10 +152,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, output_shape, - 1, - src->data_type(), - src->quantization_info()); + auto_init_if_empty(*dst, output_shape, 1, src->data_type(), src->quantization_info()); // Configure kernel window Window win; @@ -164,7 +176,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _export_to_cl_image = desc.export_weights_to_cl_image; // Update the padding for the weights tensor if we can export to cl_image - if(_export_to_cl_image) + if (_export_to_cl_image) { gemm::update_padding_for_cl_image(weights); } @@ -173,11 +185,12 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // When M0 is 5, 6, or 7, we use vload8 to fetch the data from the buffer const unsigned int load_indirect_buf_size = m0 > 4 ? 
8 : m0; const unsigned int indirect_buf_width = indirect_buffer->tensor_shape()[0]; - const unsigned int round_up_width = ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; - const unsigned int padding = round_up_width - indirect_buf_width; + const unsigned int round_up_width = + ((indirect_buf_width + load_indirect_buf_size - 1) / load_indirect_buf_size) * load_indirect_buf_size; + const unsigned int padding = round_up_width - indirect_buf_width; indirect_buffer->extend_padding(PaddingSize(0, indirect_buffer->padding().right + padding, 0, 0)); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -186,9 +199,10 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // Conditions of -cl-fast-relaxed-math causing accuracy issues can be traced from COMPMID-5324 const auto act_function = act_info.activation(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (data_type == DataType::F32 || data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (data_type == DataType::F32 || data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -224,7 +238,7 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, // A macro guard to compile ONLY the kernel of interest build_options.add_option("-D" + upper_string(kernel_name.str())); - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -253,10 +267,17 @@ void ClIndirectConv2dKernel::configure(const CLCompileContext &compile_context, _config_id += support::cpp11::to_string(dst->dimension(channel_idx)); } -Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *indirect_buffer, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc) +Status ClIndirectConv2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *indirect_buffer, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, weights, biases, indirect_buffer, dst, conv_info, act_info, desc)); return Status{}; } @@ -268,35 +289,42 @@ void ClIndirectConv2dKernel::run_op(ITensorPack &tensors, const Window &window, // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor 
*>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - const auto indirect_buffer = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + const auto indirect_buffer = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_3)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); cl::Image2D weights_cl_image; - if(_export_to_cl_image) + if (_export_to_cl_image) { - const size_t image_w = weights->info()->dimension(0) / 4; - const size_t image_h = weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); + const size_t image_w = weights->info()->dimension(0) / 4; + const size_t image_h = + weights->info()->dimension(1) * weights->info()->dimension(2) * weights->info()->dimension(3); const TensorShape shape2d(image_w, image_h); const size_t image_row_pitch = weights->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - weights_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + weights_cl_image = + create_image2d_from_buffer(CLKernelLibrary::get().context(), weights->cl_buffer(), shape2d, + weights->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); } unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, indirect_buffer); add_4d_tensor_nhwc_argument(idx, dst); - if(_export_to_cl_image) + if (_export_to_cl_image) { _kernel.setArg(idx++, weights_cl_image); } add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h index b6c7b35fa4..04166d417e 100644 --- a/src/gpu/cl/kernels/ClIndirectConv2dKernel.h +++ b/src/gpu/cl/kernels/ClIndirectConv2dKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -60,22 +61,35 @@ public: * @param[in] act_info Contains activation information described in @ref ActivationLayerInfo. * @param[in] desc Direct convolution descriptor used to build the NHWC indirect convolution kernel. 
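 *
 * @note Editorial sketch, not part of this patch: the validation in ClIndirectConv2dKernel.cpp
 *       accepts m0 in [1, 8], n0 and k0 in {1, 2, 3, 4, 8, 16}, and narrows k0 to {4, 8, 16}
 *       whenever the weights are exported to a cl_image. A hypothetical descriptor that passes
 *       those checks could look like:
 * @code
 * DirectConvComputeKernelInfo desc{};
 * desc.m0                         = 4;    // rows computed per work-item; must be in [1, 8]
 * desc.n0                         = 4;    // output channels per work-item; 1, 2, 3, 4, 8 or 16
 * desc.k0                         = 4;    // accumulation depth; only 4, 8 or 16 with cl_image weights
 * desc.export_weights_to_cl_image = true; // route weight fetches through the texture pipeline
 * @endcode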
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *off, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *off, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); /** Static function to check if given info will lead to a valid configuration * * Similar to ClIndirectConv2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *off, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, const DirectConvComputeKernelInfo &desc); + static Status validate(const ITensorInfo *src, + const ITensorInfo *off, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + const DirectConvComputeKernelInfo &desc); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: - bool _export_to_cl_image{ false }; + bool _export_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp index 66331bc818..0bb6b0c083 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.cpp @@ -29,17 +29,16 @@ #include "arm_compute/core/QuantizationInfo.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,51 +61,62 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & 
(k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; } -} +} // namespace ClMatMulLowpNativeKernel::ClMatMulLowpNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && act_info.activation() != ActivationFunction::RELU - && act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && act_info.activation() != ActivationFunction::BOUNDED_RELU), + ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY && + act_info.activation() != ActivationFunction::RELU && + act_info.activation() != ActivationFunction::LU_BOUNDED_RELU && + act_info.activation() != ActivationFunction::BOUNDED_RELU), "Activation Function specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -115,7 +125,12 @@ Status ClMatMulLowpNativeKernel::validate(const ITensorInfo *lhs, const ITensorI return Status{}; } -void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -123,7 +138,8 @@ void ClMatMulLowpNativeKernel::configure(const ClCompileContext &compile_context ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet 
initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -217,10 +233,13 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -229,7 +248,7 @@ void ClMatMulLowpNativeKernel::run_op(ITensorPack &tensors, const Window &window add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h index 64415f42f7..ffdb720855 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEKERNEL #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -54,7 +55,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Class containing information about fused activation function. 
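 *
 * @note Editorial sketch, not part of this patch: the M0/N0/K0 checks in
 *       validate_matmul_kernel_info() above encode the supported block sizes {1, 2, 3, 4, 8, 16}
 *       with a power-of-two test; an equivalent standalone predicate (hypothetical helper, shown
 *       only to unpack the bit trick) is:
 * @code
 * // v & (v - 1) is zero exactly when v is a power of two, so the accepted
 * // values are the powers of two up to 16 plus the special case 3.
 * inline bool is_supported_block_size(int v)
 * {
 *     return (v >= 1) && (v <= 16) && (((v & (v - 1)) == 0) || (v == 3));
 * }
 * @endcode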
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -62,7 +68,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp index 464212d7db..94e3c4e47b 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.cpp @@ -28,10 +28,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" @@ -64,13 +64,15 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 4), "Only 4 is supported for k0"); @@ -84,7 +86,11 @@ ClMatMulLowpNativeMMULKernel::ClMatMulLowpNativeMMULKernel() _type = CLKernelType::GEMM; } -Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); @@ -102,16 +108,17 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen ARM_COMPUTE_RETURN_ERROR_ON_MSG((act_info.activation() != ActivationFunction::IDENTITY), "Activation Function 
specified is unsupported."); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_output = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(bias, 1, DataType::S32); ARM_COMPUTE_RETURN_ERROR_ON(bias->num_dimensions() > 1); @@ -121,15 +128,21 @@ Status ClMatMulLowpNativeMMULKernel::validate(const ITensorInfo *lhs, const ITen return Status{}; } -void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, - const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) +void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info, act_info); ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); ARM_COMPUTE_UNUSED(compile_context, lhs, rhs, bias, matmul_kernel_info, act_info); CLBuildOptions build_opts; @@ -147,7 +160,8 @@ void ClMatMulLowpNativeMMULKernel::configure(const ClCompileContext &compile_con const unsigned int n0_leftover = n % n0; // Configure kernel window - const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + const auto win_config = + validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -215,10 +229,13 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto *rhs = + utils::cast::polymorphic_downcast<const 
ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto *bias = utils::cast::polymorphic_downcast<const ICLTensor *>( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + auto *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -227,7 +244,7 @@ void ClMatMulLowpNativeMMULKernel::run_op(ITensorPack &tensors, const Window &wi add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h index d2aa40b2e2..6c56f15d74 100644 --- a/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h +++ b/src/gpu/cl/kernels/ClMatMulLowpNativeMMULKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULLOWPNATIVEMMULKERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,12 @@ public: * * @return a status */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -57,7 +63,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp index 41ba5d5e25..a1fa9fa9ab 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.cpp @@ -28,9 +28,9 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/utils/ActivationFunctionUtils.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/CL/CLUtils.h" @@ -38,7 +38,6 @@ #include "src/core/helpers/WindowHelpers.h" #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -61,20 +60,23 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + 
ARM_COMPUTE_RETURN_ERROR_ON_MSG(((m0 & (m0 - 1)) && (m0 != 3)) || (m0 > 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((n0 & (n0 - 1)) && (n0 != 3)) || (n0 > 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 < 1, "Only positive integers are supported for K0"); - if(!adj_lhs || adj_rhs) + if (!adj_lhs || adj_rhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), "Only 1,2,3,4,8,16 are supported for K0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((k0 & (k0 - 1)) && (k0 != 3)) || (k0 > 16), + "Only 1,2,3,4,8,16 are supported for K0"); } return Status{}; @@ -83,30 +85,37 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) Status validate_export_to_cl_image(const ITensorInfo *rhs, const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON(matmul_kernel_info.export_rhs_to_cl_image && rhs->lock_paddings()); - if(matmul_kernel_info.export_rhs_to_cl_image) + if (matmul_kernel_info.export_rhs_to_cl_image) { - if(matmul_kernel_info.adj_rhs) + if (matmul_kernel_info.adj_rhs) { const int k0 = matmul_kernel_info.k0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, "K0 can only be: 4, 8, and 16 for Rhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(k0 != 4 && k0 != 8 && k0 != 16, + "K0 can only be: 4, 8, and 16 for Rhs transposed"); } else { const int n0 = matmul_kernel_info.n0; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 != 4 && n0 != 8 && n0 != 16, + "N0 can only be: 4, 8, and 16 for Rhs non-transposed"); } - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), "Export to CLImage is not supported for this device/configuration"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!export_to_cl_image(rhs), + "Export to CLImage is not supported for this device/configuration"); } return Status{}; } -} +} // namespace ClMatMulNativeKernel::ClMatMulNativeKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); @@ -114,28 +123,36 @@ Status ClMatMulNativeKernel::validate(const ITensorInfo *lhs, const ITensorInfo ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_matmul_input_shapes(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_export_to_cl_image(rhs, matmul_kernel_info)); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), 
rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(bias, lhs); ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); } return Status{}; } -void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, +void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst, &compile_context, &matmul_kernel_info); @@ -143,7 +160,8 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -187,7 +205,7 @@ void ClMatMulNativeKernel::configure(const ClCompileContext &compile_context, IT // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { gemm::update_padding_for_cl_image(rhs); } @@ -222,10 +240,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = 
utils::cast::polymorphic_downcast<const ICLTensor *>( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); @@ -235,7 +256,7 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl add_3d_tensor_nhw_argument(idx, lhs); cl::Image2D rhs_cl_image; - if(_export_rhs_to_cl_image) + if (_export_rhs_to_cl_image) { const size_t image_w = rhs->info()->dimension(0) / 4; const size_t image_h = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0); @@ -243,12 +264,13 @@ void ClMatMulNativeKernel::run_op(ITensorPack &tensors, const Window &window, cl const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1]; // Export cl_buffer to cl_image - rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); + rhs_cl_image = create_image2d_from_buffer(CLKernelLibrary::get().context(), rhs->cl_buffer(), shape2d, + rhs->info()->data_type(), image_row_pitch, CLImage2DType::ReadOnly); _kernel.setArg(idx++, rhs_cl_image); } add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeKernel.h b/src/gpu/cl/kernels/ClMatMulNativeKernel.h index fe2b787c12..2cb150bc8f 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeKernel.h @@ -25,6 +25,7 @@ #define ACL_SRC_GPU_CL_KERNELS_CLMATMULNATIVEKERNEL #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -52,7 +53,12 @@ public: * @param[in] matmul_kernel_info Attributes for Batch MatMul Kernel * @param[in] act_info (Optional) Specifies activation function to use after Matrix multiplication. Default is Identity function. 
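 *
 * @note Editorial sketch, not part of this patch: when matmul_kernel_info.export_rhs_to_cl_image
 *       is set, run_op() re-reads the RHS buffer as a read-only 2D image with four elements per
 *       texel, mirroring the shape computation in the .cpp hunk above:
 * @code
 * const size_t image_w         = rhs->info()->dimension(0) / 4; // 4 elements per texel
 * const size_t image_h         = rhs->info()->tensor_shape().total_size() / rhs->info()->dimension(0);
 * const size_t image_row_pitch = rhs->info()->strides_in_bytes()[1];
 * // create_image2d_from_buffer() then wraps the existing cl_buffer rather than copying it.
 * @endcode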
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -60,14 +66,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - bool _export_rhs_to_cl_image{ false }; + bool _export_rhs_to_cl_image{false}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp index 432270e8bf..76bf846e74 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.cpp @@ -28,14 +28,13 @@ #include "arm_compute/core/ITensorPack.h" #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/StringUtils.h" #include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -62,31 +61,38 @@ Status validate_matmul_kernel_info(const MatMulKernelInfo &matmul_kernel_info) // Validate M0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(m0 < 1, "Only positive integers are supported for M0"); - if(adj_lhs) + if (adj_lhs) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((m0 != 1) && (m0 != 2) && (m0 != 3) && (m0 != 4) && (m0 != 8) && (m0 != 16), + "Only 1,2,3,4,8,16 are supported for M0 for Lhs transposed"); } // Validate N0 ARM_COMPUTE_RETURN_ERROR_ON_MSG(n0 < 1, "Only positive integers are supported for N0"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), "Only 1,2,3,4,8,16 are supported for N0"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((n0 != 1) && (n0 != 2) && (n0 != 3) && (n0 != 4) && (n0 != 8) && (n0 != 16), + "Only 1,2,3,4,8,16 are supported for N0"); // Validate K0 ARM_COMPUTE_RETURN_ERROR_ON_MSG((k0 != 1), "Only 1 is supported for k0"); return Status{}; } -} +} // namespace ClMatMulNativeMMULKernel::ClMatMulNativeMMULKernel() { _type = CLKernelType::GEMM; } -Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo 
*bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), "The extension cl_arm_matrix_multiply is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(!arm_matrix_multiply_supported(CLKernelLibrary::get().get_device()), + "The extension cl_arm_matrix_multiply is not supported on the target platform"); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, rhs); ARM_COMPUTE_RETURN_ON_ERROR(validate_matmul_kernel_info(matmul_kernel_info)); @@ -96,32 +102,40 @@ Status ClMatMulNativeMMULKernel::validate(const ITensorInfo *lhs, const ITensorI const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); ARM_COMPUTE_RETURN_ERROR_ON_MSG_VAR((lhs_k % mmul_k0) != 0, "K dimension must be a multiple of %d", mmul_k0); - const TensorShape expected_output_shape = misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); + const TensorShape expected_output_shape = + misc::shape_calculator::compute_matmul_shape(lhs_shape, rhs->tensor_shape(), matmul_kernel_info); - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorInfo tensor_info_dst = dst->clone()->set_tensor_shape(expected_output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &tensor_info_dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, dst); } - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MSG((bias->num_dimensions() > 1), "Multi dimensional bias is unsupported."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], "First dimension of bias and output tensors must match."); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(bias->dimension(0) != expected_output_shape[0], + "First dimension of bias and output tensors must match."); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(lhs, bias); } return Status{}; } -void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info) +void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst, matmul_kernel_info); ARM_COMPUTE_ERROR_THROW_ON(validate(lhs, rhs, bias, dst, matmul_kernel_info)); // dst tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape(lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); + auto_init_if_empty(*dst, lhs->clone()->set_tensor_shape(misc::shape_calculator::compute_matmul_shape( + lhs->tensor_shape(), rhs->tensor_shape(), matmul_kernel_info))); const int m = dst->dimension(1); const int n = dst->dimension(0); @@ -135,7 +149,8 @@ void ClMatMulNativeMMULKernel::configure(const ClCompileContext &compile_context const int n0 = adjust_vec_size(matmul_kernel_info.n0, n); // Configure kernel window - const auto win_config = validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); + const auto win_config = + 
validate_and_configure_window_for_mmul_kernels(lhs, rhs, dst, matmul_kernel_info, mmul_m0, mmul_n0); ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); @@ -186,17 +201,20 @@ void ClMatMulNativeMMULKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const ICLTensor *lhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const ICLTensor *rhs = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present - ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const ICLTensor *lhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const ICLTensor *rhs = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const ICLTensor *bias = utils::cast::polymorphic_downcast<const ICLTensor *>( + tensors.get_const_tensor(TensorType::ACL_SRC_2)); // nullptr if bias is not present + ICLTensor *dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, bias, dst); unsigned int idx = 0; add_3d_tensor_nhw_argument(idx, lhs); add_3d_tensor_nhw_argument(idx, rhs); - if(bias != nullptr) + if (bias != nullptr) { add_3d_tensor_nhw_argument(idx, bias); } diff --git a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h index 80448974c4..1aeb896325 100644 --- a/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h +++ b/src/gpu/cl/kernels/ClMatMulNativeMMULKernel.h @@ -72,22 +72,31 @@ public: * @param[out] dst Output tensor info. 
* @param[in] matmul_info Attributes for Batch MatMul Kernel */ - void configure(const ClCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *bias, ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *bias, + ITensorInfo *dst, + const MatMulKernelInfo &matmul_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMatMulNativeMMULKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *bias, const ITensorInfo *dst, const MatMulKernelInfo &matmul_info); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *bias, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int _m{ 1 }; - int _n{ 1 }; - int _k{ 1 }; + int _m{1}; + int _n{1}; + int _k{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMulKernel.cpp b/src/gpu/cl/kernels/ClMulKernel.cpp index 5ca0639852..3b59c2a7fc 100644 --- a/src/gpu/cl/kernels/ClMulKernel.cpp +++ b/src/gpu/cl/kernels/ClMulKernel.cpp @@ -23,15 +23,16 @@ */ #include "src/gpu/cl/kernels/ClMulKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/CL/OpenCL.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -46,24 +47,25 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(overflow_policy); ARM_COMPUTE_UNUSED(rounding_policy); ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, DataType::S32, - DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); 
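// The checks that follow encode the kernel's runtime contract: the scale factor must be non-negative, an activation can only be fused when the destination type is floating point, and the two input shapes must be broadcast compatible.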
ARM_COMPUTE_RETURN_ERROR_ON_MSG(scale < 0, "Scale cannot be negative."); ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); @@ -76,27 +78,35 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MSG(out_shape.total_size() == 0, "Inputs are not broadcast compatible"); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, - 1, - DataType::U8, DataType::QASYMM8, DataType::QASYMM8_SIGNED, - DataType::S16, DataType::QSYMM16, DataType::F16, - DataType::S32, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::U8, DataType::QASYMM8, + DataType::QASYMM8_SIGNED, DataType::S16, DataType::QSYMM16, + DataType::F16, DataType::S32, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::U8 && + (src1->data_type() != DataType::U8 || src2->data_type() != DataType::U8), "Dst can only be U8 if both src are U8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8 && (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), - "Dst can only be QASYMM8 if both src are QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QASYMM8_SIGNED && (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), - "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(dst->data_type() == DataType::QSYMM16 && (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), - "Dst can only be QSYMM16 if both src are QSYMM16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && (dst->data_type() != DataType::S32), + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QASYMM8 && + (src1->data_type() != DataType::QASYMM8 || src2->data_type() != DataType::QASYMM8), + "Dst can only be QASYMM8 if both src are QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QASYMM8_SIGNED && + (src1->data_type() != DataType::QASYMM8_SIGNED || src2->data_type() != DataType::QASYMM8_SIGNED), + "Dst can only be QASYMM8_SIGNED if both src are QASYMM8_SIGNED"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + dst->data_type() == DataType::QSYMM16 && + (src1->data_type() != DataType::QSYMM16 || src2->data_type() != DataType::QSYMM16), + "Dst can only be QSYMM16 if both src are QSYMM16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((src1->data_type() == DataType::S32 || src2->data_type() == DataType::S32) && + (dst->data_type() != DataType::S32), "Dst must be S32 if source tensors are S32"); - if(in_place) + if (in_place) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, src1_in_place ? src1->tensor_shape() : src2->tensor_shape(), 0), - "Wrong shape for dst, cannot do in_place calculation"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + detail::have_different_dimensions(out_shape, + src1_in_place ? 
src1->tensor_shape() : src2->tensor_shape(), 0), + "Wrong shape for dst, cannot do in_place calculation"); } else { @@ -114,14 +124,19 @@ ClMulKernel::ClMulKernel() _type = CLKernelType::ELEMENTWISE; } -void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMulKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); - ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, - scale, overflow_policy, rounding_policy, act_info)); + ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); @@ -133,7 +148,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo // Use int scaling if factor is equal to 1/2^n for 0 <= n <= 15 // frexp returns 0.5 as mantissa which means that the exponent will be in the range of -14 <= e <= 1 // Moreover, it will be negative as we deal with 1/2^n - if((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) + if ((normalized_mantissa == 0.5f) && (-14 <= exponent) && (exponent <= 1)) { // Store the positive exponent. We know that we compute 1/2^n // Additionally we need to subtract 1 to compensate for the fact that frexp used a mantissa of 0.5 @@ -142,19 +157,19 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo std::string acc_type; // Check if it has float src and dst - if(is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) + if (is_data_type_float(src1->data_type()) || is_data_type_float(src2->data_type())) { scale_int = -1; acc_type = (src1->data_type() == DataType::F32 || src2->data_type() == DataType::F32) ? "float" : "half"; } else { - if(src1->element_size() == 4 || src2->element_size() == 4) + if (src1->element_size() == 4 || src2->element_size() == 4) { // Use 64-bit accumulator for 32-bit input acc_type = "long"; } - else if(src1->element_size() == 2 || src2->element_size() == 2) + else if (src1->element_size() == 2 || src2->element_size() == 2) { // Use 32-bit accumulator for 16-bit input acc_type = "int"; @@ -176,11 +191,15 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo build_opts.add_option("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(src1->data_type())); build_opts.add_option("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(src2->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); - build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) ? "1" : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN1=" + ((dst->dimension(0) != 1 && src1->dimension(0) == 1) + ?
"1" + : support::cpp11::to_string(vec_size))); + build_opts.add_option("-DVEC_SIZE_IN2=" + ((dst->dimension(0) != 1 && src2->dimension(0) == 1) + ? "1" + : support::cpp11::to_string(vec_size))); build_opts.add_option("-DVEC_SIZE_OUT=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); - if(is_quantized && (dst->data_type() != DataType::S32)) + if (is_quantized && (dst->data_type() != DataType::S32)) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -200,12 +219,14 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo else { kernel_name += (scale_int >= 0) ? "_int" : "_float"; - build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), "-DWRAP", "-DSATURATE"); + build_opts.add_option_if_else(overflow_policy == ConvertPolicy::WRAP || is_data_type_float(dst->data_type()), + "-DWRAP", "-DSATURATE"); build_opts.add_option_if_else(rounding_policy == RoundingPolicy::TO_ZERO, "-DROUND=_rtz", "-DROUND=_rte"); build_opts.add_option("-DACC_DATA_TYPE=" + acc_type); - if(act_info.enabled()) + if (act_info.enabled()) { - build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); + build_opts.add_option("-DACTIVATION_TYPE=" + + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option("-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); } @@ -223,7 +244,7 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo // Set scale argument unsigned int idx = (in_place ? 
2 : 3) * num_arguments_per_3D_tensor(); // Skip the src and dst parameters - if(scale_int >= 0 && !is_quantized) + if (scale_int >= 0 && !is_quantized) { _kernel.setArg(idx++, scale_int); } @@ -261,8 +282,13 @@ void ClMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo _config_id += support::cpp11::to_string(dst->dimension(2)); } -Status ClMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMulKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info)); @@ -275,9 +301,11 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); ARM_COMPUTE_ERROR_ON_NULLPTR(src_0, src_1, dst); @@ -286,17 +314,18 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? 
in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -312,7 +341,7 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command unsigned int idx = 0; add_3D_tensor_argument(idx, src_0, slice_input1); add_3D_tensor_argument(idx, src_1, slice_input2); - if(!in_place) + if (!in_place) { add_3D_tensor_argument(idx, dst, slice); } @@ -320,15 +349,17 @@ void ClMulKernel::run_op(ITensorPack &tensors, const Window &window, cl::Command ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } namespace { constexpr unsigned int vec_size_complex = 1; -Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status validate_arguments_complex(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src1, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src2, 2, DataType::F16, DataType::F32); @@ -340,11 +371,12 @@ Status validate_arguments_complex(const ITensorInfo *src1, const ITensorInfo *sr ARM_COMPUTE_RETURN_ERROR_ON(act_info.enabled() && !is_data_type_float(dst->data_type())); // Validate in case of configured dst - if(dst->total_size() > 0) + if (dst->total_size() > 0) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 2, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), "Wrong shape for dst"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(detail::have_different_dimensions(out_shape, dst->tensor_shape(), 0), + "Wrong shape for dst"); } return Status{}; @@ -356,19 +388,23 @@ ClComplexMulKernel::ClComplexMulKernel() _type = CLKernelType::ELEMENTWISE; } -void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMulKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_complex(src1, src2, dst, act_info)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const TensorShape &out_shape = TensorShape::broadcast_shape(src1->tensor_shape(), src2->tensor_shape()); auto_init_if_empty(*dst, src1->clone()->set_tensor_shape(out_shape)); CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - if(act_info.enabled()) + if (act_info.enabled()) { build_opts.add_option("-DACTIVATION_TYPE=" + lower_string(string_from_activation_func(act_info.activation()))); build_opts.add_option("-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); @@ -384,7 +420,10 @@ void ClComplexMulKernel::configure(const CLCompileContext &compile_context, ITen ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClComplexMulKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMulKernel::validate(const 
ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_complex(src1, src2, dst, act_info)); @@ -397,26 +436,29 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl:: ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src_0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto src_1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src_0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto src_1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); const TensorShape &in_shape1 = src_0->info()->tensor_shape(); const TensorShape &in_shape2 = src_1->info()->tensor_shape(); const TensorShape &out_shape = dst->info()->tensor_shape(); bool can_collapse = true; - if(std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) + if (std::min(in_shape1.total_size(), in_shape2.total_size()) > 1) { can_collapse = (std::min(in_shape1.num_dimensions(), in_shape2.num_dimensions()) > Window::DimZ); - for(size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) + for (size_t d = Window::DimZ; can_collapse && (d < out_shape.num_dimensions()); ++d) { can_collapse = (in_shape1[d] == in_shape2[d]); } } bool has_collapsed = false; - Window collapsed = can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; + Window collapsed = + can_collapse ? window.collapse_if_possible(ICLKernel::window(), Window::DimZ, &has_collapsed) : window; const TensorShape &in_shape1_collapsed = has_collapsed ? in_shape1.collapsed_from(Window::DimZ) : in_shape1; const TensorShape &in_shape2_collapsed = has_collapsed ? in_shape2.collapsed_from(Window::DimZ) : in_shape2; @@ -435,8 +477,7 @@ void ClComplexMulKernel::run_op(ITensorPack &tensors, const Window &window, cl:: ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input1)); ARM_COMPUTE_UNUSED(collapsed.slide_window_slice_3D(slice_input2)); - } - while(collapsed.slide_window_slice_3D(slice)); + } while (collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClMulKernel.h b/src/gpu/cl/kernels/ClMulKernel.h index 4e62a6d67a..76a3ce02c1 100644 --- a/src/gpu/cl/kernels/ClMulKernel.h +++ b/src/gpu/cl/kernels/ClMulKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_MUL_KERNEL_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -72,16 +73,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
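* @note Scales of the form 1/2^n (0 <= n <= 15) take the integer fast path in the implementation: for scale = 0.5f, std::frexp returns mantissa 0.5 and exponent 0, so the kernel stores the shift n = 1 - exponent (here 1) instead of a float multiplier. A minimal call sketch, assuming the tensor infos are initialized by the caller:
* @code
* ClMulKernel mul;
* mul.configure(compile_context, &src1_info, &src2_info, &dst_info, 0.5f, ConvertPolicy::WRAP, RoundingPolicy::TO_ZERO);
* @endcode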
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -101,14 +113,21 @@ public: * @param[out] dst The dst tensor info. Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMulKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClPermuteKernel.cpp b/src/gpu/cl/kernels/ClPermuteKernel.cpp index 8d4655114b..a4755782ed 100644 --- a/src/gpu/cl/kernels/ClPermuteKernel.cpp +++ b/src/gpu/cl/kernels/ClPermuteKernel.cpp @@ -29,8 +29,9 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,13 +61,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const "Permutation up to 4-D src tensor is supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(perm.num_dimensions() < 1 || perm.num_dimensions() > 4, "Permutation vector size should be less than or equal to 4"); - for(const auto &p : perm) + for (const auto &p : perm) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(p >= perm.num_dimensions(), "Permutation 
vector has invalid values"); } // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { const TensorShape dst_shape = misc::shape_calculator::compute_permutation_output_shape(*src, perm); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(dst->tensor_shape(), dst_shape); @@ -82,10 +83,13 @@ ClPermuteKernel::ClPermuteKernel() _type = CLKernelType::ELEMENTWISE; } -void ClPermuteKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermuteKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const TensorShape dst_shape = get_dst_shape(src, perm); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); @@ -96,7 +100,8 @@ void ClPermuteKernel::configure(const CLCompileContext &compile_context, const I // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); + build_opts.add_option("-DDATA_TYPE=" + + get_cl_unsigned_type_from_element_size(data_size_from_type(src->data_type()))); build_opts.add_option("-DDEPTH_IN=" + support::cpp11::to_string(src->dimension(2))); // New positions of width(W), height(H), channel(C) and batch(D) based on permutation vector build_opts.add_option("-DP1=" + support::cpp11::to_string((_perm.num_dimensions() >= 1) ? perm[0] : 0)); @@ -126,8 +131,9 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice_in = window.first_slice_window_4D().collapse(ICLKernel::window(), 2, 4); @@ -144,9 +150,8 @@ void ClPermuteKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com add_4D_tensor_argument(idx, src, slice_in); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice_in, lws_hint()); - } - while(window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); + } while (window.slide_window_slice_4D(slice_in) && window.slide_window_slice_4D(slice_out)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClPermuteKernel.h b/src/gpu/cl/kernels/ClPermuteKernel.h index 0d349e739b..2413b10284 100644 --- a/src/gpu/cl/kernels/ClPermuteKernel.h +++ b/src/gpu/cl/kernels/ClPermuteKernel.h @@ -52,7 +52,10 @@ public: * @param[in] dst The dst tensor info. Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClPermuteKernel::configure() diff --git a/src/gpu/cl/kernels/ClPool2dKernel.cpp b/src/gpu/cl/kernels/ClPool2dKernel.cpp index a1afc585e0..41ab4d6922 100644 --- a/src/gpu/cl/kernels/ClPool2dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool2dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,37 +44,47 @@ using namespace arm_compute::misc::shape_calculator; namespace { -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), - "Unsupported combination of parameters!"); - - const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); - const bool is_global_pooling = pool_info.is_global_pooling; - unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - unsigned int pool_size_y = is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; - int output_width = 0; - int output_height = 0; - - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); - - std::tie(output_width, output_height) = scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - pool_size_x, pool_size_y, pool_info.pad_stride_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (is_data_type_quantized_asymmetric(src->data_type()) && pool_info.pool_type == PoolingType::L2), + "Unsupported combination of parameters!"); + + const auto data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); + const bool is_global_pooling = pool_info.is_global_pooling; + unsigned int pool_size_x = is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + unsigned int pool_size_y = is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + int output_width = 0; + int output_height = 0; + + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + + std::tie(output_width, output_height) = + scaled_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], pool_size_x, + pool_size_y, pool_info.pad_stride_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1), + "Calculated output dimension size is invalid"); // Check indices - if(indices) + if (indices) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, "Pooling indices only supported for MAX pooling method"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), "Pooling indices only supported for pool size 2x2"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(pool_info.pool_type != PoolingType::MAX, + "Pooling indices only supported for MAX pooling method"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.pool_size != Size2D(2, 2)), + "Pooling indices only supported for pool size 2x2"); - if(indices->total_size() != 0) + if (indices->total_size() != 0) { TensorInfo idx_info(TensorInfo(compute_pool_shape(*src, pool_info), 1, DataType::U32)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(indices, &idx_info); @@ -81,7 +92,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const } // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -98,42 +109,47 @@ ClPool2dKernel::ClPool2dKernel() _type = CLKernelType::POOL; } -void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices) +void ClPool2dKernel::configure(const ClCompileContext &compile_context, + 
ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info, indices)); - auto padding_info = get_padding_info({ src, dst, indices }); + auto padding_info = get_padding_info({src, dst, indices}); // Auto init if empty TensorShape out_shape = compute_pool_shape(*src, pool_info); auto_init_if_empty(*dst, src->clone()->set_tensor_shape(out_shape)); - if(indices) + if (indices) { auto_init_if_empty(*indices, src->clone()->set_tensor_shape(out_shape).set_data_type(DataType::U32)); } // Set instance variables - _pool_info = pool_info; - _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; - _num_elems_processed_per_iteration = (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4); + _pool_info = pool_info; + _data_layout = pool_info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : pool_info.data_layout; + _num_elems_processed_per_iteration = + (_data_layout == DataLayout::NCHW) ? 1 : ((dst->data_type() == DataType::F32) ? 2 : 4); _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0)); - int pool_stride_x = 0; - int pool_stride_y = 0; - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; - const bool exclude_padding = pool_info.exclude_padding; + int pool_stride_x = 0; + int pool_stride_y = 0; + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? 
src->dimension(idx_height) : pool_info.pool_size.height; + const PadStrideInfo pad_stride_info = pool_info.pad_stride_info; + const bool exclude_padding = pool_info.exclude_padding; std::tie(pool_stride_x, pool_stride_y) = pad_stride_info.stride(); - const int pool_pad_top = pad_stride_info.pad_top(); - const int pool_pad_left = pad_stride_info.pad_left(); - const DataType data_type = src->data_type(); + const int pool_pad_top = pad_stride_info.pad_top(); + const int pool_pad_left = pad_stride_info.pad_left(); + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -148,20 +164,23 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI build_opts.add_option("-DPOOL_SIZE_Y=" + support::cpp11::to_string(pool_size_y)); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width))); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height))); - build_opts.add_option("-DMAX_WIDTH=" + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); - build_opts.add_option("-DMAX_HEIGHT=" + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); + build_opts.add_option("-DMAX_WIDTH=" + + support::cpp11::to_string(src->dimension(idx_width) + (exclude_padding ? 0 : pool_pad_left))); + build_opts.add_option("-DMAX_HEIGHT=" + + support::cpp11::to_string(src->dimension(idx_height) + (exclude_padding ? 0 : pool_pad_top))); // Tensor paddings are used to calculate the indices for MAX pooling - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && + is_data_type_float(data_type)) { build_opts.add_option("-DSRC_BATCH=" + support::cpp11::to_string(src->tensor_shape().total_size_lower(3))); } - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { build_opts.add_option("-DQUANTIZED"); - if(src->quantization_info() != dst->quantization_info()) + if (src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -174,9 +193,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } // Set the initial value for the pooling operation according to the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -184,7 +203,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } else { - std::string initial_value = pool_info.use_inf_as_limit ? "(-INFINITY)" : float_to_string_with_full_precision(std::numeric_limits<float>::lowest()); + std::string initial_value = pool_info.use_inf_as_limit + ?
"(-INFINITY)" + : float_to_string_with_full_precision(std::numeric_limits<float>::lowest()); build_opts.add_option("-DINITIAL_VALUE=" + initial_value); } } @@ -195,22 +216,25 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } // Create kernel - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision; const auto use_wider_accumulator = use_fp_mixed_precision && (pool_type != PoolingType::MAX); - const auto acc_data_type = get_cl_type_from_data_type(use_wider_accumulator ? DataType::F32 : (is_data_type_quantized(data_type) ? DataType::S32 : data_type)); + const auto acc_data_type = get_cl_type_from_data_type( + use_wider_accumulator ? DataType::F32 + : (is_data_type_quantized(data_type) ? DataType::S32 : data_type)); build_opts.add_option("-DACC_DATA_TYPE=" + acc_data_type); build_opts.add_option_if(use_wider_accumulator, "-DFP_MIXED_PRECISION"); - if(pool_type != PoolingType::MAX) + if (pool_type != PoolingType::MAX) { build_opts.add_option_if(exclude_padding, "-DEXCLUDE_PADDING"); } - if(pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && is_data_type_float(data_type)) + if (pool_info.pool_size == Size2D(2, 2) && pool_type == PoolingType::MAX && indices && + is_data_type_float(data_type)) { // For max pooling with pool2x2, store indicies which will be used in max unpooling std::string kernel_name = "pooling_layer_2_nchw_indices"; @@ -226,18 +250,19 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI case DataLayout::NHWC: { // Floating point mixed precision is support on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) // Cast 2: Quantized (int8/uint8 src data and int32 accumulation ) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) + else if (is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) { acc_data_type = DataType::S32; } @@ -250,8 +275,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI build_opts.add_option("-DDST_HEIGHT=" + support::cpp11::to_string(dst->dimension(idx_height))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); - if(pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + if (pool_info.pool_size == Size2D(2, 2) && is_data_type_float(data_type)) { build_opts.add_option_if(indices != nullptr && pool_type == PoolingType::MAX, "-DEXTRACT_MAX_INDEX"); @@ -260,7 +286,9 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI } else { - 
std::string kernel_name = is_data_type_quantized_asymmetric(data_type) ? "pooling_layer_MxN_quantized_nhwc" : "pooling_layer_MxN_nhwc"; + std::string kernel_name = is_data_type_quantized_asymmetric(data_type) + ? "pooling_layer_MxN_quantized_nhwc" + : "pooling_layer_MxN_nhwc"; _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); } break; @@ -290,7 +318,10 @@ void ClPool2dKernel::configure(const ClCompileContext &compile_context, ITensorI ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClPool2dKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices) +Status ClPool2dKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, pool_info, indices)); return Status{}; @@ -301,18 +332,19 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - unsigned int pool_stride_x = 0; - unsigned int pool_stride_y = 0; + unsigned int pool_stride_x = 0; + unsigned int pool_stride_y = 0; std::tie(pool_stride_x, pool_stride_y) = _pool_info.pad_stride_info.stride(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); - auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); + auto indices = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_1)); // Collapse window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -323,13 +355,12 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm unsigned int idx = 0; add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) + if (indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_size == Size2D(2, 2))) { add_3D_tensor_argument(idx, indices, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); break; } case DataLayout::NHWC: @@ -338,7 +369,8 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm Window slice = window_collapsed.first_slice_window_4D(); Window in_slice = window_collapsed.first_slice_window_4D(); - in_slice.set(Window::DimX, Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); + in_slice.set(Window::DimX, + Window::Dimension(0, src->info()->dimension(0), _num_elems_processed_per_iteration)); in_slice.set(Window::DimY, Window::Dimension(0, src->info()->dimension(1), pool_stride_x)); in_slice.set(Window::DimZ, Window::Dimension(0, src->info()->dimension(2), pool_stride_y)); in_slice.set(3, Window::Dimension(0, batch_size, 1)); @@ 
-348,13 +380,13 @@ void ClPool2dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm unsigned int idx = 0; add_4D_tensor_argument(idx, src, in_slice); add_4D_tensor_argument(idx, dst, slice); - if(indices && is_data_type_float(src->info()->data_type()) && (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) + if (indices && is_data_type_float(src->info()->data_type()) && + (_pool_info.pool_type == PoolingType::MAX) && (_pool_info.pool_size == Size2D(2, 2))) { add_4D_tensor_argument(idx, indices, slice); } enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); + } while (window.slide_window_slice_4D(slice) && window.slide_window_slice_4D(in_slice)); break; } default: diff --git a/src/gpu/cl/kernels/ClPool2dKernel.h b/src/gpu/cl/kernels/ClPool2dKernel.h index f5bb0687e8..56b95a37d5 100644 --- a/src/gpu/cl/kernels/ClPool2dKernel.h +++ b/src/gpu/cl/kernels/ClPool2dKernel.h @@ -50,22 +50,29 @@ public: * @param[in] pool_info Contains pooling operation information described in @ref PoolingLayerInfo. * @param[out] indices (optional) The indices of the maximal values. Data type supported: U32. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &pool_info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2dKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &pool_info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &pool_info, + const ITensorInfo *indices = nullptr); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; public: PoolingLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClPool3dKernel.cpp b/src/gpu/cl/kernels/ClPool3dKernel.cpp index d068832fed..a08c5d4be7 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.cpp +++ b/src/gpu/cl/kernels/ClPool3dKernel.cpp @@ -28,6 +28,7 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -50,10 +51,13 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->data_layout() != DataLayout::NDHWC, "Only NDHWC layout supported"); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), "Strides cannot be zero."); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); - 
ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && (!pool_info.exclude_padding - && (pool_info.pool_type == PoolingType::AVG)), + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + (pool_info.stride.x() == 0 || pool_info.stride.y() == 0 || pool_info.stride.z() == 0), + "Strides cannot be zero."); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, + DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((!is_data_type_float(src->data_type())) && + (!pool_info.exclude_padding && (pool_info.pool_type == PoolingType::AVG)), "Exclude padding is unsupported for non-float types for Avg op"); const auto data_layout = src->data_layout(); @@ -68,17 +72,21 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const int output_height = 0; int output_depth = 0; - bool round_type_ceil_with_asymm_padding = (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, "Cannot use dimension round type CEIL when padding is asymmetric."); + bool round_type_ceil_with_asymm_padding = + (pool_info.round_type == DimensionRoundingType::CEIL) && (!is_symmetric(pool_info.padding)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(round_type_ceil_with_asymm_padding, + "Cannot use dimension round type CEIL when padding is asymmetric."); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), "Pooling region that is entirely outside input tensor is unsupported"); - std::tie(output_width, output_height, output_depth) = scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], - src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, - pool_size_z, pool_info); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(is_pool_3d_region_entirely_outside_input(pool_info), + "Pooling region that is entirely outside input tensor is unsupported"); + std::tie(output_width, output_height, output_depth) = + scaled_3d_dimensions_signed(src->tensor_shape()[idx_width], src->tensor_shape()[idx_height], + src->tensor_shape()[idx_depth], pool_size_x, pool_size_y, pool_size_z, pool_info); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), "Calculated output dimension size is invalid"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((output_width < 1 || output_height < 1 || output_depth < 1), + "Calculated output dimension size is invalid"); // Checks performed when dst is configured - if(dst->total_size() != 0) + if (dst->total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, dst); @@ -95,11 +103,14 @@ ClPool3dKernel::ClPool3dKernel() _type = CLKernelType::POOL; } -void ClPool3dKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info) +void ClPool3dKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, pool_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Auto init if empty TensorShape out_shape = compute_pool3d_shape(src->tensor_shape(), pool_info); @@ -112,23 +123,23 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT 
_num_elems_processed_per_iteration = (dst->data_type() == DataType::F32) ? 2 : 4; _num_elems_processed_per_iteration = adjust_vec_size(_num_elems_processed_per_iteration, dst->dimension(0)); - const PoolingType pool_type = pool_info.pool_type; - const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); - const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH); - const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); - const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); - const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; - const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; - const int pool_size_z = pool_info.is_global_pooling ? src->dimension(idx_depth) : pool_info.pool_size.depth; - const bool exclude_padding = pool_info.exclude_padding; - const int pool_stride_x = pool_info.stride.x(); - const int pool_stride_y = pool_info.stride.y(); - const int pool_stride_z = pool_info.stride.z(); - const int pool_pad_top = pool_info.padding.top; - const int pool_pad_left = pool_info.padding.left; - const int pool_pad_front = pool_info.padding.front; - const DataType data_type = src->data_type(); + const PoolingType pool_type = pool_info.pool_type; + const int idx_width = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::WIDTH); + const int idx_height = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::HEIGHT); + const int idx_depth = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::DEPTH); + const int idx_channel = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::CHANNEL); + const int idx_batch_size = get_data_layout_dimension_index(_data_layout, DataLayoutDimension::BATCHES); + const int pool_size_x = pool_info.is_global_pooling ? src->dimension(idx_width) : pool_info.pool_size.width; + const int pool_size_y = pool_info.is_global_pooling ? src->dimension(idx_height) : pool_info.pool_size.height; + const int pool_size_z = pool_info.is_global_pooling ? 
src->dimension(idx_depth) : pool_info.pool_size.depth; + const bool exclude_padding = pool_info.exclude_padding; + const int pool_stride_x = pool_info.stride.x(); + const int pool_stride_y = pool_info.stride.y(); + const int pool_stride_z = pool_info.stride.z(); + const int pool_pad_top = pool_info.padding.top; + const int pool_pad_left = pool_info.padding.left; + const int pool_pad_front = pool_info.padding.front; + const DataType data_type = src->data_type(); // Set build options CLBuildOptions build_opts; @@ -149,7 +160,7 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DSRC_DEPTH=" + support::cpp11::to_string(src->dimension(idx_depth))); // If datatype is quantized add relevant parameters - if(is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(data_type) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = dst->quantization_info().uniform(); @@ -161,9 +172,9 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Set the initial value for the pooling operation according to the data type - if(pool_type == PoolingType::MAX) + if (pool_type == PoolingType::MAX) { - if(is_data_type_quantized(data_type)) + if (is_data_type_quantized(data_type)) { PixelValue type_min{}; std::tie(type_min, std::ignore) = get_min_max(data_type); @@ -171,7 +182,8 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } else { - build_opts.add_option("-DINITIAL_VALUE=" + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); + build_opts.add_option("-DINITIAL_VALUE=" + + float_to_string_with_full_precision(std::numeric_limits<float>::lowest())); } } else @@ -181,16 +193,18 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT } // Create kernel // Floating point mixed precision is supported on F16 only - const auto use_fp_mixed_precision = (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; + const auto use_fp_mixed_precision = + (data_type == DataType::F16) && pool_info.fp_mixed_precision && pool_type != PoolingType::MAX; // Wider accumulation is required to avoid accuracy loss // Case 1: Floating point mixed precision (fp16 src data and fp32 accumulation) DataType acc_data_type = data_type; - if(use_fp_mixed_precision) + if (use_fp_mixed_precision) { acc_data_type = DataType::F32; } - else if(is_data_type_quantized(data_type) && pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division + else if (is_data_type_quantized(data_type) && + pool_type != PoolingType::MAX) // Use S32 for avg pooling to allow for integer division { acc_data_type = DataType::S32; } @@ -202,11 +216,13 @@ void ClPool3dKernel::configure(const ClCompileContext &compile_context, const IT build_opts.add_option("-DDST_DEPTH=" + support::cpp11::to_string(dst->dimension(idx_depth))); build_opts.add_option("-DDST_CHANNELS=" + support::cpp11::to_string(dst->dimension(idx_channel))); build_opts.add_option("-DDST_BATCH_SIZE=" + support::cpp11::to_string(dst->dimension(idx_batch_size))); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" +
support::cpp11::to_string(src->dimension(0) % _num_elems_processed_per_iteration)); // if datatype is quantized use quantized kernel function - std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" : "pooling_3d_layer_MxN_ndhwc"); - _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); + std::string kernel_name = (is_data_type_quantized_asymmetric(data_type) ? "pooling_3d_layer_MxN_ndhwc_quantized" + : "pooling_3d_layer_MxN_ndhwc"); + _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); // Configure kernel window Window win = calculate_max_window(*dst, Steps(_num_elems_processed_per_iteration)); @@ -240,8 +256,9 @@ void ClPool3dKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comm ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST_0)); // Collapse 3D window Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); diff --git a/src/gpu/cl/kernels/ClPool3dKernel.h b/src/gpu/cl/kernels/ClPool3dKernel.h index 00852349e6..6cd229c427 100644 --- a/src/gpu/cl/kernels/ClPool3dKernel.h +++ b/src/gpu/cl/kernels/ClPool3dKernel.h @@ -50,7 +50,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src. * @param[in] pool_info Contains pooling operation information described in @ref Pooling3dLayerInfo. 
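 *
 * A minimal usage sketch (illustrative only, not part of the library documentation;
 * compile_ctx, src_info and dst_info are assumed to be a valid CLCompileContext and
 * NDHWC tensor infos set up by the caller):
 * @code
 * Pooling3dLayerInfo pool3d_info{};           // public members, as read by configure()
 * pool3d_info.pool_type = PoolingType::AVG;
 * pool3d_info.pool_size = Size3D(3, 3, 3);
 * ClPool3dKernel kernel;
 * kernel.configure(compile_ctx, &src_info, &dst_info, pool3d_info);
 * @endcode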
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &pool_info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &pool_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3dKernel::configure() @@ -64,8 +67,8 @@ public: private: Pooling3dLayerInfo _pool_info{}; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - unsigned int _num_elems_processed_per_iteration{ 1 }; + DataLayout _data_layout{DataLayout::UNKNOWN}; + unsigned int _num_elems_processed_per_iteration{1}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClQuantizeKernel.cpp b/src/gpu/cl/kernels/ClQuantizeKernel.cpp index 5c8bf97f0f..e8df420f67 100644 --- a/src/gpu/cl/kernels/ClQuantizeKernel.cpp +++ b/src/gpu/cl/kernels/ClQuantizeKernel.cpp @@ -29,13 +29,12 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" - #include "support/Cast.h" #include "support/StringSupport.h" @@ -50,12 +49,14 @@ namespace Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F32, DataType::F16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F32, DataType::F16); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); // Output must always be initialized ARM_COMPUTE_RETURN_ERROR_ON(dst->tensor_shape().total_size() == 0); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QASYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QASYMM16); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(src, dst); return Status{}; @@ -71,7 +72,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); @@ -84,7 +85,7 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const float scale_to_apply = qinfo.scale; int32_t offset_to_apply = qinfo.offset; - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { /* * In case of requantization of a quantized input tensor to an output tensor with another quantization @@ -132,8 +133,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_IN=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output_data_type)); - build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - 
vec_size_x, 0))); - std::pair<int, int> min_max_quant_values = quantization::get_min_max_values_from_quantized_data_type(output_data_type); + build_opts.add_option_if( + multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(input_width_x - vec_size_x, 0))); + std::pair<int, int> min_max_quant_values = + quantization::get_min_max_values_from_quantized_data_type(output_data_type); build_opts.add_option("-DMIN_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.first)); build_opts.add_option("-DMAX_QUANT_VAL=" + support::cpp11::to_string(min_max_quant_values.second)); @@ -141,9 +144,10 @@ void ClQuantizeKernel::configure(const CLCompileContext &compile_context, const // Configure kernel window Window win = calculate_max_window(*src, Steps()); - if(multi_access_x) + if (multi_access_x) { - win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); + win.set(Window::DimX, + Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x)); } ICLKernel::configure_internal(win); @@ -173,8 +177,7 @@ void ClQuantizeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Co add_3D_tensor_argument(idx, src, slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClReshapeKernel.cpp b/src/gpu/cl/kernels/ClReshapeKernel.cpp index 121bb33edf..53889f3a6b 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClReshapeKernel.cpp @@ -30,6 +30,7 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -51,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst) ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); ARM_COMPUTE_RETURN_ERROR_ON(src->data_type() == DataType::UNKNOWN); - if(dst->tensor_shape().total_size() != 0) + if (dst->tensor_shape().total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -72,27 +73,17 @@ void ClReshapeKernel::configure(const CLCompileContext &compile_context, const I ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - std::set<std::string> build_opts = { "-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size()) }; + std::set<std::string> build_opts = {"-DDATA_TYPE=" + get_cl_unsigned_type_from_element_size(src->element_size())}; _kernel = create_kernel(compile_context, "reshape_layer", build_opts); // Add static arguments - const cl_int2 src_shape = - { - { - static_cast<cl_int>(src->tensor_shape()[0]), - static_cast<cl_int>(src->tensor_shape()[1]) - } - }; - const cl_int2 dst_shape = - { - { - static_cast<cl_int>(dst->tensor_shape()[0]), - static_cast<cl_int>(dst->tensor_shape()[1]) - } - }; + const cl_int2 src_shape = { + {static_cast<cl_int>(src->tensor_shape()[0]), static_cast<cl_int>(src->tensor_shape()[1])}}; + const cl_int2 dst_shape = { + {static_cast<cl_int>(dst->tensor_shape()[0]), 
static_cast<cl_int>(dst->tensor_shape()[1])}}; unsigned int idx = 2 * num_arguments_per_3D_tensor(); // Skip the src and dst parameters _kernel.setArg<cl_int2>(idx++, src_shape); _kernel.setArg<cl_int2>(idx++, dst_shape); @@ -119,8 +110,9 @@ void ClReshapeKernel::run_op(ITensorPack &tensors, const Window &window, cl::Com Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), Window::DimZ); Window slice = window_collapsed.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); // Set srcs unsigned int idx = 0; diff --git a/src/gpu/cl/kernels/ClReshapeKernel.h b/src/gpu/cl/kernels/ClReshapeKernel.h index db6ab5da58..95eae82086 100644 --- a/src/gpu/cl/kernels/ClReshapeKernel.h +++ b/src/gpu/cl/kernels/ClReshapeKernel.h @@ -58,7 +58,7 @@ public: // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; }; -} // namespace opencl } // namespace kernels +} // namespace opencl } // namespace arm_compute #endif /* ARM_COMPUTE_CL_RESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClScaleKernel.cpp b/src/gpu/cl/kernels/ClScaleKernel.cpp index 4c4373a215..4305acad26 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.cpp +++ b/src/gpu/cl/kernels/ClScaleKernel.cpp @@ -27,8 +27,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" -#include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/utils/InterpolationPolicyUtils.h" +#include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" @@ -43,7 +44,8 @@ namespace kernels { namespace { -inline std::tuple<float, float> calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) +inline std::tuple<float, float> +calculate_scale_factors(const ITensorInfo *src, const ITensorInfo *dst, DataLayout data_layout, bool align_corners) { const int idx_width = get_data_layout_dimension_index(data_layout, DataLayoutDimension::WIDTH); const int idx_height = get_data_layout_dimension_index(data_layout, DataLayoutDimension::HEIGHT); @@ -64,20 +66,25 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *dst, const { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::U8, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(dst == src); - ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels()!=1); - ARM_COMPUTE_RETURN_ERROR_ON(info.align_corners && !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); 
- ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && !is_data_type_quantized_asymmetric(src->data_type())); + ARM_COMPUTE_RETURN_ERROR_ON(src->num_channels() != 1); + ARM_COMPUTE_RETURN_ERROR_ON( + info.align_corners && + !arm_compute::scale_utils::is_align_corners_allowed_sampling_policy(info.sampling_policy)); + ARM_COMPUTE_RETURN_ERROR_ON(is_data_type_quantized(src->data_type()) && + !is_data_type_quantized_asymmetric(src->data_type())); float scale_x = 0.f; float scale_y = 0.f; const DataLayout data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; - std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); + std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, data_layout, info.align_corners); - ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && (scale_x > 1.f || scale_y > 1.f)); + ARM_COMPUTE_RETURN_ERROR_ON(info.interpolation_policy == InterpolationPolicy::AREA && + (scale_x > 1.f || scale_y > 1.f)); return Status{}; } @@ -94,23 +101,26 @@ ClScaleKernel::ClScaleKernel() _type = CLKernelType::ELEMENTWISE; } -void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScaleKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Info required for the static tuning _data_layout = info.data_layout == DataLayout::UNKNOWN ? src->data_layout() : info.data_layout; const bool is_nhwc = _data_layout == DataLayout::NHWC; - float scale_x = 0.f; - float scale_y = 0.f; + float scale_x = 0.f; + float scale_y = 0.f; std::tie(scale_x, scale_y) = calculate_scale_factors(src, dst, _data_layout, info.align_corners); // Area interpolation behaves as Nearest Neighbour in case of up-sampling auto interpolation_policy_to_use = info.interpolation_policy; - if(info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) + if (info.interpolation_policy == InterpolationPolicy::AREA && scale_x <= 1.f && scale_y <= 1.f) { interpolation_policy_to_use = InterpolationPolicy::NEAREST_NEIGHBOR; } @@ -127,7 +137,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn unsigned int vec_size_leftover = 0; CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { vec_size = adjust_vec_size(src->data_type() == DataType::F32 ? 
4 : 8, dst_channels); vec_size_leftover = dst_channels % vec_size; @@ -135,7 +145,8 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option("-DSRC_DATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DDST_TENSOR_TYPE=BUFFER"); build_opts.add_option("-DDST_DATA_TYPE=" + get_cl_type_from_data_type(dst->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DN0=" + support::cpp11::to_string(vec_size)); build_opts.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DSCALE_" + string_from_interpolation_policy(interpolation_policy_to_use)); @@ -144,27 +155,33 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); build_opts.add_option_if(is_data_type_float(src->data_type()), "-DIS_FLOATING_POINT"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); } - else if(_data_layout == DataLayout::NCHW) + else if (_data_layout == DataLayout::NCHW) { vec_size = adjust_vec_size(4, dst_width); vec_size_leftover = dst_width % vec_size; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); - build_opts.add_option("-DCONSTANT_VALUE=" + string_from_pixel_value(info.constant_border_value, src->data_type())); + build_opts.add_option("-DCONSTANT_VALUE=" + + string_from_pixel_value(info.constant_border_value, src->data_type())); build_opts.add_option("-DSRC_WIDTH=" + support::cpp11::to_string(src_width)); build_opts.add_option("-DSRC_HEIGHT=" + support::cpp11::to_string(src_height)); build_opts.add_option("-DSCALE_X=" + float_to_string_with_full_precision(scale_x)); build_opts.add_option("-DSCALE_Y=" + float_to_string_with_full_precision(scale_y)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) ? support::cpp11::to_string(vec_size) : support::cpp11::to_string(vec_size_leftover))); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + ((vec_size_leftover == 0) + ? 
support::cpp11::to_string(vec_size) + : support::cpp11::to_string(vec_size_leftover))); build_opts.add_option_if(info.border_mode == BorderMode::REPLICATE, "-DBORDER_MODE_REPLICATE"); build_opts.add_option_if(info.border_mode == BorderMode::CONSTANT, "-DBORDER_MODE_CONSTANT"); build_opts.add_option_if(info.align_corners, "-DALIGN_CORNERS"); - build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", "-DSAMPLING_POLICY_TOP_LEFT"); + build_opts.add_option_if_else(info.sampling_policy == SamplingPolicy::CENTER, "-DSAMPLING_POLICY_CENTER", + "-DSAMPLING_POLICY_TOP_LEFT"); - const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && info.interpolation_policy == InterpolationPolicy::BILINEAR; - if(is_qasymm_bilinear) + const bool is_qasymm_bilinear = is_data_type_quantized_asymmetric(src->data_type()) && + info.interpolation_policy == InterpolationPolicy::BILINEAR; + if (is_qasymm_bilinear) { const UniformQuantizationInfo qinfo = src->quantization_info().uniform(); build_opts.add_option("-DSCALE=" + support::cpp11::to_string(qinfo.scale)); @@ -190,7 +207,7 @@ void ClScaleKernel::configure(const CLCompileContext &compile_context, ITensorIn ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); // Pass scale kernel arguments - if(is_nhwc) + if (is_nhwc) { unsigned int idx = 2 * num_arguments_per_4d_tensor_nhwc(); _kernel.setArg<cl_float>(idx++, scale_x); @@ -219,7 +236,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); - switch(_data_layout) + switch (_data_layout) { case DataLayout::NCHW: { @@ -231,8 +248,7 @@ void ClScaleKernel::run_op(ITensorPack &tensors, const Window &window, cl::Comma add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); break; } case DataLayout::NHWC: diff --git a/src/gpu/cl/kernels/ClScaleKernel.h b/src/gpu/cl/kernels/ClScaleKernel.h index dd09e92ee2..c09659017d 100644 --- a/src/gpu/cl/kernels/ClScaleKernel.h +++ b/src/gpu/cl/kernels/ClScaleKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo Kernel descriptor to be used to configure. 
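 *
 * A minimal usage sketch (illustrative only; compile_ctx, src_info and dst_info are
 * assumed to exist, and the two-argument ScaleKernelInfo constructor shown here relies
 * on its remaining parameters taking their defaults):
 * @code
 * ScaleKernelInfo info(InterpolationPolicy::BILINEAR, BorderMode::REPLICATE);
 * ClScaleKernel kernel;
 * kernel.configure(compile_ctx, &src_info, &dst_info, info);
 * @endcode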
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClScaleKernel::configure() @@ -62,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - DataLayout _data_layout{ DataLayout::UNKNOWN }; + DataLayout _data_layout{DataLayout::UNKNOWN}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp index 59299fa441..1b5a2666bc 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp @@ -22,12 +22,14 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,15 +62,16 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float // Number of integer bits used in temporary fixed-point representation of exponent accumulator static const int exp_accumulation_in_bits = 12; - const double beta_multiplier = std::min( - 1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), - (1LL << 31) - 1.0); + const double beta_multiplier = + std::min(1.0 * beta * input_scale * (1 << (31 - scaled_diff_int_bits)), (1LL << 31) - 1.0); int input_beta_multiplier; int input_beta_left_shift; - quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, &input_beta_left_shift); + quantization::calculate_quantized_multiplier_greater_than_one(beta_multiplier, &input_beta_multiplier, + &input_beta_left_shift); - const double max_input_rescaled = 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); - const int diff_min = -1.f * std::floor(max_input_rescaled); + const double max_input_rescaled = + 1.0 * ((1 << scaled_diff_int_bits) - 1) * (1LL << (31 - scaled_diff_int_bits)) / (1LL << input_beta_left_shift); + const int diff_min = -1.f * std::floor(max_input_rescaled); CLBuildOptions build_opts; build_opts.add_option("-DSCALED_DIFF_INT_BITS=" + support::cpp11::to_string(scaled_diff_int_bits)); @@ -80,18 +83,22 @@ CLBuildOptions prepare_quantized_softmax_build_options(float input_scale, float return build_opts; } -Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const ITensorInfo &sum) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); 
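// Worked example for the fixed-point setup in prepare_quantized_softmax_build_options()
// above (a sketch only: scaled_diff_int_bits is taken as 5, since its definition is
// elided here, with a QASYMM8 input scale of 1/256 and beta == 1):
//   beta_multiplier    = min(1.0 * (1/256) * 2^(31 - 5), 2^31 - 1) = 2^18 = 262144
//   quantized form     : input_beta_multiplier = 2^30, input_beta_left_shift = 19
//                        (262144 == 0.5 * 2^19)
//   max_input_rescaled = (2^5 - 1) * 2^(31 - 5) / 2^19 = 31 * 128 = 3968
//   DIFF_MIN           = -3968, i.e. values more than 3968 fixed-point units below the
//                        row maximum contribute a negligible term to the exponent sum.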
ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &max); const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(src.data_type()); // Checks performed when output is configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { - if(is_quantized_asymmetric) + if (is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&dst, 1, DataType::S32); } @@ -103,9 +110,9 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor } // Checks performed when sum is configured - if(sum.total_size() != 0) + if (sum.total_size() != 0) { - if(is_quantized_asymmetric) + if (is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&sum, 1, DataType::S32); } @@ -119,7 +126,10 @@ Status validate_arguments_1DMaxShiftExpSum(const ITensorInfo &src, const ITensor return Status{}; } -Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +Status validate_arguments_1DNorm(const ITensorInfo &src, + const ITensorInfo &sum, + const ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::S32, DataType::F16, DataType::F32); @@ -127,14 +137,15 @@ Status validate_arguments_1DNorm(const ITensorInfo &src, const ITensorInfo &sum, ARM_COMPUTE_RETURN_ERROR_ON(info.is_log && !is_data_type_float(info.input_data_type)); // Note: output should always have a scale of 1/256 and offset 0 - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const QuantizationInfo allowed_quantization_info = + get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); // Checks performed when output is configured - if(dst.total_size() != 0) + if (dst.total_size() != 0) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst); - if(!is_quantized_asymmetric) + if (!is_quantized_asymmetric) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst); } @@ -161,9 +172,14 @@ ClLogits1DMaxShiftExpSumKernel::ClLogits1DMaxShiftExpSumKernel() _type = CLKernelType::ELEMENTWISE; } -void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info) +void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &max, + ITensorInfo &dst, + ITensorInfo &sum, + const SoftmaxKernelInfo &info) { - auto padding_info = get_padding_info({ &src, &max, &dst, &sum }); + auto padding_info = get_padding_info({&src, &max, &dst, &sum}); // Output auto initialization if not yet initialized auto_init_if_empty(sum, src.clone()->set_tensor_shape(max.tensor_shape())); @@ -191,15 +207,21 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c build_opts.add_option("-DLOG_VECTOR_SIZE=" + support::cpp11::to_string(lround(log2(vector_size)))); build_opts.add_option_if((reduction_dim_size % vector_size) != 0, "-DNON_MULTIPLE_OF_VECTOR_SIZE"); build_opts.add_option_if(is_signed_qasymm8, "-DQASYMM8_SIGNED"); - build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), "-DBETA=" + 
float_to_string_with_full_precision(beta)); + build_opts.add_option_if(is_data_type_float(dt) && (beta != 1.0f), + "-DBETA=" + float_to_string_with_full_precision(beta)); build_opts.add_option_if(is_data_type_float(dt) && info.is_log, "-DLOG_SOFTMAX"); - build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") : std::string("-FLT_MAX"))); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), "-DBETA=" + float_to_string_with_full_precision(beta)); - build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); + build_opts.add_option_if(is_data_type_float(dt), "-DMINVAL=" + ((dt == DataType::F16) ? std::string("-HALF_MAX") + : std::string("-FLT_MAX"))); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); + build_opts.add_option_if(is_data_type_quantized_asymmetric(dt), + "-DBETA=" + float_to_string_with_full_precision(beta)); + build_opts.add_options_if(is_data_type_quantized_asymmetric(dt), + prepare_quantized_softmax_build_options(qinfo.scale, beta).options()); cl::NDRange lws_hint(cl::NullRange); - std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; + std::string kernel_name = std::string("softmax_layer_max_shift_exp_sum_") + + (is_data_type_quantized_asymmetric(dt) ? "quantized_" : "") + "serial"; // Create kernel. _kernel = create_kernel(compile_context, kernel_name, build_opts.options()); @@ -211,7 +233,10 @@ void ClLogits1DMaxShiftExpSumKernel::configure(const CLCompileContext &compile_c ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum) +Status ClLogits1DMaxShiftExpSumKernel::validate(const ITensorInfo &src, + const ITensorInfo &max, + const ITensorInfo &dst, + const ITensorInfo &sum) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DMaxShiftExpSum(src, max, dst, sum)); return Status{}; @@ -241,7 +266,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window & // Reconfigure window in case of parallel reduction ParallelReductionInfo parallel_reduction_info = is_parallel_reduction(src->info()->dimension(0)); - if(std::get<0>(parallel_reduction_info)) + if (std::get<0>(parallel_reduction_info)) { // Launch grid_size parallel work items window_collapsed.set(Window::DimX, Window::Dimension(0, _grid_size, 1)); @@ -258,8 +283,7 @@ void ClLogits1DMaxShiftExpSumKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); add_3D_tensor_argument(idx, sum, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } ClLogits1DNormKernel::ClLogits1DNormKernel() @@ -267,18 +291,24 @@ ClLogits1DNormKernel::ClLogits1DNormKernel() _type = CLKernelType::ELEMENTWISE; } -void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, 
+ const ITensorInfo &sum, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { - auto padding_info = get_padding_info({ &src, &dst, &sum }); + auto padding_info = get_padding_info({&src, &dst, &sum}); // Note: output should always have a scale of 1/256 and offset 0 - const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); - const DataType output_data_type = info.input_data_type; - const QuantizationInfo allowed_quantization_info = get_softmax_output_quantization_info(info.input_data_type, info.is_log); - const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); + const bool is_quantized_asymmetric = is_data_type_quantized_asymmetric(info.input_data_type); + const DataType output_data_type = info.input_data_type; + const QuantizationInfo allowed_quantization_info = + get_softmax_output_quantization_info(info.input_data_type, info.is_log); + const UniformQuantizationInfo qinfo = src.quantization_info().uniform(); // Output auto initialization if not yet initialized - auto_init_if_empty(dst, src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); + auto_init_if_empty(dst, + src.clone()->set_data_type(output_data_type).set_quantization_info(allowed_quantization_info)); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments_1DNorm(src, sum, dst, info)); @@ -311,7 +341,10 @@ void ClLogits1DNormKernel::configure(const CLCompileContext &compile_context, co ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClLogits1DNormKernel::validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info) +Status ClLogits1DNormKernel::validate(const ITensorInfo &src, + const ITensorInfo &sum, + const ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments_1DNorm(src, sum, dst, info)); @@ -343,9 +376,8 @@ void ClLogits1DNormKernel::run_op(ITensorPack &tensors, const Window &window, :: add_3D_tensor_argument(idx, sum, sum_slice); add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.h b/src/gpu/cl/kernels/ClSoftmaxKernel.h index a221e12132..2dd53da346 100644 --- a/src/gpu/cl/kernels/ClSoftmaxKernel.h +++ b/src/gpu/cl/kernels/ClSoftmaxKernel.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,14 +62,20 @@ public: * @param[out] sum Sum of 1D logits tensor. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &max, ITensorInfo &dst, ITensorInfo &sum, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &max, + ITensorInfo &dst, + ITensorInfo &sum, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogits1DMaxShiftExpSumKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); + static Status + validate(const ITensorInfo &src, const ITensorInfo &max, const ITensorInfo &dst, const ITensorInfo &sum); /** Checks if the given size is eligible for parallel reduction * * @note Serial reduction is launched for width < (_grid_size * _serial_vector_size). @@ -100,14 +107,19 @@ public: * @param[out] dst Destination tensor. Data types supported: QASYMM8/QASYMM8_SIGNED for S32 @p input, or same as @p input * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. 
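 *
 * A minimal usage sketch (illustrative only; compile_ctx plus the src/sum/dst tensor
 * infos are assumed to come from the preceding max-shift-exp-sum stage, and the field
 * names follow @ref SoftmaxKernelInfo):
 * @code
 * SoftmaxKernelInfo info{};
 * info.beta            = 1.0f;
 * info.is_log          = false;
 * info.input_data_type = src_info.data_type();
 * ClLogits1DNormKernel norm;
 * norm.configure(compile_ctx, src_info, sum_info, dst_info, info);
 * @endcode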
*/ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, const ITensorInfo &sum, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + const ITensorInfo &sum, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClLogits1DNormKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); + static Status + validate(const ITensorInfo &src, const ITensorInfo &sum, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; diff --git a/src/gpu/cl/kernels/ClTransposeKernel.cpp b/src/gpu/cl/kernels/ClTransposeKernel.cpp index 6450ffb5b2..6eb2bf81c0 100644 --- a/src/gpu/cl/kernels/ClTransposeKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposeKernel.cpp @@ -29,9 +29,10 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -58,12 +59,12 @@ void ClTransposeKernel::configure(const CLCompileContext &compile_context, const auto_init_if_empty(*dst, src->clone()->set_tensor_shape(dst_shape)); ARM_COMPUTE_ERROR_THROW_ON(ClTransposeKernel::validate(src, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Create kernel - const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); + const unsigned int vec_size_x = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(0)); const int vec_size_x_leftovers = src->dimension(0) % vec_size_x; - const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); + const unsigned int vec_size_y = adjust_vec_size(max_cl_vector_width / src->element_size(), src->dimension(1)); const int vec_size_y_leftovers = src->dimension(1) % vec_size_y; CLBuildOptions build_opts; @@ -89,9 +90,10 @@ Status ClTransposeKernel::validate(const ITensorInfo *src, const ITensorInfo *ds ARM_COMPUTE_RETURN_ERROR_ON_MSG(src->num_dimensions() > 2, "Transpose up to 2-D src tensor is supported"); // Validate configured dst - if(dst->total_size() != 0) + if (dst->total_size() != 0) { - const TensorInfo dst_info = src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); + const TensorInfo dst_info = + src->clone()->set_tensor_shape(misc::shape_calculator::compute_transposed_shape(*src)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(dst, &dst_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(src, dst); @@ -106,8 +108,9 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_MISMATCHING_WINDOWS(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = 
utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_2D(); @@ -117,9 +120,8 @@ void ClTransposeKernel::run_op(ITensorPack &tensors, const Window &window, cl::C add_2D_tensor_argument(idx, src, slice); add_2D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_2D(slice)); + } while (window.slide_window_slice_2D(slice)); } } // namespace kernels } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp index ae825694c5..76f39ac500 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.cpp @@ -26,14 +26,14 @@ #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/utils/quantization/AsymmHelpers.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" -#include "arm_compute/core/utils/quantization/AsymmHelpers.h" - namespace arm_compute { namespace opencl @@ -42,11 +42,15 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(input); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, DataType::QASYMM8_SIGNED, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F16, DataType::F32, + DataType::QASYMM8_SIGNED, DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, weights); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(input, DataLayout::NHWC); ARM_COMPUTE_RETURN_ERROR_ON_DATA_LAYOUT_NOT_IN(weights, DataLayout::NHWC); @@ -56,12 +60,13 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, constexpr unsigned int height_idx = 2; constexpr unsigned int batch_idx = 3; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), "Weights feature map dimension should match the respective src's one"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->dimension(channel_idx) != input->dimension(channel_idx), + "Weights feature map dimension should match the respective src's one"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights->num_dimensions() > 4, "Weights can be at most 4 dimensional"); - if(biases != nullptr) + if (biases != nullptr) { - if(is_data_type_quantized_asymmetric(input->data_type())) + if (is_data_type_quantized_asymmetric(input->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -77,15 +82,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { const size_t input_width = input->dimension(width_idx); const size_t input_height = input->dimension(height_idx); const size_t weights_width = weights->dimension(width_idx); const size_t weights_height = weights->dimension(height_idx); - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); - TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + TensorShape output_shape = + 
misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -96,8 +103,12 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *weights, } } // namespace -void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_UNUSED(biases, deconv_info); ARM_COMPUTE_ERROR_ON_NULLPTR(input, weights, output); @@ -119,7 +130,8 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const size_t output_channels = output->dimension(channel_idx); // Calculate output shape - auto out_dims = deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); + auto out_dims = + deconvolution_output_dimensions(input_width, input_height, weights_width, weights_height, deconv_info); TensorShape output_shape = misc::shape_calculator::compute_deconvolution_output_shape(out_dims, *input, *weights); auto_init_if_empty(*output, output_shape, 1, input->data_type(), input->quantization_info()); @@ -147,7 +159,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co const DataType input_data_type = input->data_type(); const PaddingInfo strides = deconv_info.stride(); - if(biases != nullptr) + if (biases != nullptr) { build_options.add_option(std::string("-DHAS_BIAS")); build_options.add_option(std::string("-DBIA_DATA_TYPE=" + get_cl_type_from_data_type(biases->data_type()))); @@ -180,7 +192,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DPARTIAL_N0=" + support::cpp11::to_string(partial_store_n0)); build_options.add_option_if((input_channels % k0) != 0, "-DLEFTOVER_LOOP"); - if(is_data_type_quantized(output_data_type)) + if (is_data_type_quantized(output_data_type)) { const UniformQuantizationInfo iqinfo = input->quantization_info().uniform(); const UniformQuantizationInfo wqinfo = weights->quantization_info().uniform(); @@ -210,7 +222,7 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co build_options.add_option("-DZERO_VALUE=" + support::cpp11::to_string(0)); } - if(compile_context.get_ddk_version() >= 30) + if (compile_context.get_ddk_version() >= 30) { build_options.add_option("-fregister-allocation=64"); } @@ -235,8 +247,11 @@ void ClTransposedConvolutionKernel::configure(const CLCompileContext &compile_co _config_id += support::cpp11::to_string(n0); } -Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *dst, const PadStrideInfo &deconv_info) +Status ClTransposedConvolutionKernel::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, deconv_info)); return Status{}; @@ -250,17 +265,20 @@ void ClTransposedConvolutionKernel::run_op(ITensorPack &tensors, 
const Window &w // Get initial windows Window slice = window.first_slice_window_3D(); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); - const auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); - const auto biases = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); + const auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + const auto biases = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_2)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4d_tensor_nhwc_argument(idx, src); add_4d_tensor_nhwc_argument(idx, dst); add_4d_tensor_nhwc_argument(idx, weights); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, slice); } diff --git a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h index d4350dda50..44f6f56b7a 100644 --- a/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h +++ b/src/gpu/cl/kernels/ClTransposedConvolutionKernel.h @@ -45,16 +45,23 @@ public: * Similar to @ref ClTransposedConvolution::configure() * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -63,4 +70,4 @@ public: } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp index 8f36345076..af80c4d796 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.cpp @@ -22,9 +22,11 @@ * SOFTWARE. */ #include "src/gpu/cl/kernels/ClWeightsReshapeKernel.h" + #include "arm_compute/core/CL/ICLTensor.h" #include "arm_compute/core/Error.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" @@ -39,7 +41,10 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, const ITensorInfo *output, unsigned int num_groups) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *biases, + const ITensorInfo *output, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_RETURN_ERROR_ON(input->data_type() == DataType::UNKNOWN); @@ -48,20 +53,24 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *biases, c ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4 && num_groups > 1); ARM_COMPUTE_RETURN_ERROR_ON((input->dimension(3) % num_groups) != 0); - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(!is_data_type_float(input->data_type())); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, biases); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->num_dimensions() != 1)); ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->num_dimensions() != 2)); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && (biases->dimension(0) != input->tensor_shape()[3])); - ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 5) && (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); + ARM_COMPUTE_RETURN_ERROR_ON((input->num_dimensions() == 4) && + (biases->dimension(0) != input->tensor_shape()[3])); + ARM_COMPUTE_RETURN_ERROR_ON( + (input->num_dimensions() == 5) && + (biases->dimension(0) != input->tensor_shape()[3] || biases->dimension(1) != input->tensor_shape()[4])); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); + ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS( + output->tensor_shape(), compute_weights_reshaped_shape(*input, biases != nullptr, num_groups)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_QUANTIZATION_INFO(input, output); } @@ -75,16 +84,21 @@ ClWeightsReshapeKernel::ClWeightsReshapeKernel() { _type = CLKernelType::ELEMENTWISE; } -void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups) +void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, 
(biases != nullptr), num_groups))); + auto_init_if_empty( + *dst, src->clone()->set_tensor_shape(compute_weights_reshaped_shape(*src, (biases != nullptr), num_groups))); // Perform validation step ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, biases, dst, num_groups)); - auto padding_info = get_padding_info({ src, biases, dst }); + auto padding_info = get_padding_info({src, biases, dst}); const DataType data_type = src->data_type(); @@ -104,7 +118,10 @@ void ClWeightsReshapeKernel::configure(const ClCompileContext &compile_context, ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups) +Status ClWeightsReshapeKernel::validate(const ITensorInfo *src, + const ITensorInfo *biases, + const ITensorInfo *dst, + unsigned int num_groups) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, biases, dst, num_groups)); return Status{}; } @@ -136,7 +153,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, _kernel.setArg<cl_uint>(idx++, src->info()->dimension(3)); _kernel.setArg<cl_uint>(idx++, dst->info()->strides_in_bytes().z()); - if(biases != nullptr) + if (biases != nullptr) { biases_window.use_tensor_dimensions(biases->info()->tensor_shape()); biases_slice = biases_window.first_slice_window_1D(); @@ -148,7 +165,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, unsigned idx = 0; add_3D_tensor_argument(idx, src, in_slice); add_2D_tensor_argument(idx, dst, out_slice); - if(biases != nullptr) + if (biases != nullptr) { add_1D_tensor_argument(idx, biases, biases_slice); ARM_COMPUTE_UNUSED(biases_window.slide_window_slice_1D(biases_slice)); @@ -156,8 +173,7 @@ void ClWeightsReshapeKernel::run_op(ITensorPack &tensors, const Window &window, // Run kernel enqueue(queue, *this, in_slice, lws_hint()); - } - while(window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); + } while (window.slide_window_slice_4D(in_slice) && out_window.slide_window_slice_2D(out_slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h index 7364eb97ae..5e05f8d006 100644 --- a/src/gpu/cl/kernels/ClWeightsReshapeKernel.h +++ b/src/gpu/cl/kernels/ClWeightsReshapeKernel.h @@ -75,14 +75,19 @@ public: * @param[in] num_groups (Optional) Number of groups when performing a grouped convolution. num_groups != 1 is only supported for NCHW data layout * Number of groups greater than one is only supported for NCHW data layout, and the number of weights must be a multiple of it. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *biases, ITensorInfo *dst, unsigned int num_groups = 1); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *biases, + ITensorInfo *dst, + unsigned int num_groups = 1); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWeightsReshapeKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); + static Status + validate(const ITensorInfo *src, const ITensorInfo *biases, const ITensorInfo *dst, unsigned int num_groups = 1); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -90,4 +95,4 @@ public: } // namespace kernels } // namespace opencl } // namespace arm_compute -#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */
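For orientation, the reshape that compute_weights_reshaped_shape() drives above flattens each convolution filter into one row of a 2D matrix for the GEMM-based convolution path, appending one element per filter when a bias is folded in and splitting the output channels across num_groups. A hedged sketch of the element counts only; the exact axis order and orientation in the library may differ:

#include <cstdio>

// Hedged sketch of the weights-reshape bookkeeping for a [W, H, IFM, OFM]
// filter bank: each filter flattens to W*H*IFM values (+1 for a fused bias),
// and OFM is divided across num_groups (NCHW only, per the checks above).
int main()
{
    const unsigned int w = 3, h = 3, ifm = 64, ofm = 128;
    const bool         has_bias   = true;
    const unsigned int num_groups = 1; // must divide the OFM dimension

    const unsigned int row_len = w * h * ifm + (has_bias ? 1u : 0u);
    std::printf("per group: %u filters x %u elements, %u group(s)\n",
                ofm / num_groups, row_len, num_groups);
    return 0;
}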
\ No newline at end of file +#endif /*ARM_COMPUTE_CL_WEIGHTSRESHAPE_KERNEL_H */ diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp index 0a9a3f021f..15195025ce 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.cpp @@ -29,11 +29,11 @@ #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -52,7 +52,7 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, dst); ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -63,7 +63,8 @@ Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, cons } } // namespace -Status ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) +Status +ClWidthConcatenate2TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, dst)); return Status{}; } @@ -74,12 +75,15 @@ ClWidthConcatenate2TensorsKernel::ClWidthConcatenate2TensorsKernel() { _type = CLKernelType::ELEMENTWISE; } -void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst) +void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, dst)); - auto padding_info = get_padding_info({ src1, src2, dst }); + auto padding_info = get_padding_info({src1, src2, dst}); const unsigned int min_dimension = std::min(src1->dimension(0), src2->dimension(0)); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); @@ -91,11 +95,12 @@ void ClWidthConcatenate2TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); // If inputs have different quantization info, set quantization parameters needed for the re-quantization process const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2); - 
if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -146,9 +151,11 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window Window slice = window.first_slice_window_4D(); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); do { @@ -159,8 +166,7 @@ void ClWidthConcatenate2TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_int>(idx++, _depth); _kernel.setArg<cl_int>(idx++, _input1_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h index 5c54479002..8b53d6d66b 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate2TensorsKernel.h @@ -62,8 +62,8 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp index 54f7ad344a..c4f84e3e45 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.cpp @@ -30,11 +30,11 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "src/core/utils/helpers/tensor_info.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -45,15 +45,20 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status validate_arguments(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(src1); ARM_COMPUTE_RETURN_ERROR_ON(src1->data_type() == DataType::UNKNOWN); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src1, src2, src3, src4, dst); - ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > dst->dimension(0)); 
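The INPUT1_ROTATE_N option computed above tells the kernel how far a vector that straddles the seam between the two concatenated tensors must be rotated so its lanes realign with the destination. A small sketch of that arithmetic, with adjust_vec_size(8, ...) approximated by a plain min (an assumption about its behaviour):

#include <algorithm>
#include <cstdio>

// Hedged sketch of the two-tensor width-concat build options: VEC_SIZE lanes
// per work-item, VEC_SIZE_LEFTOVER from the destination remainder, and the
// seam rotation (src1_w - leftover) % VEC_SIZE, mirroring the code above.
int main()
{
    const unsigned int src1_w = 30, src2_w = 50;
    const unsigned int dst_w = src1_w + src2_w;

    const unsigned int vec_size = std::min(8u, std::min(src1_w, src2_w));
    const unsigned int leftover = dst_w % vec_size;
    const unsigned int rotate1  = (src1_w - leftover) % vec_size;

    std::printf("VEC_SIZE=%u VEC_SIZE_LEFTOVER=%u INPUT1_ROTATE_N=%u\n",
                vec_size, leftover, rotate1);
    return 0;
}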
+ ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(0) + src2->dimension(0) + src3->dimension(0) + src4->dimension(0) > + dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src1->dimension(i) != dst->dimension(i)); ARM_COMPUTE_RETURN_ERROR_ON(src2->dimension(i) != dst->dimension(i)); @@ -71,22 +76,29 @@ ClWidthConcatenate4TensorsKernel::ClWidthConcatenate4TensorsKernel() _type = CLKernelType::ELEMENTWISE; } -Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst) +Status ClWidthConcatenate4TensorsKernel::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src1, src2, src3, src4, dst)); return Status{}; } void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile_context, - ITensorInfo *src1, ITensorInfo *src2, - ITensorInfo *src3, ITensorInfo *src4, - ITensorInfo *dst) + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src1, src2, src3, src4, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src1, src2, src3, src4, dst)); - auto padding_info = get_padding_info({ src1, src2, src3, src4, dst }); - const unsigned int min_dimension = std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); + auto padding_info = get_padding_info({src1, src2, src3, src4, dst}); + const unsigned int min_dimension = + std::min(std::min(src1->dimension(0), src2->dimension(0)), std::min(src3->dimension(0), src4->dimension(0))); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(8, min_dimension); const unsigned int vec_size_leftover = dst->dimension(0) % num_elems_processed_per_iteration; @@ -96,9 +108,14 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover)); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(src1->element_size())); - build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT2_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); - build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + src3->dimension(0) - vec_size_leftover) % num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT1_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT2_ROTATE_N=" + + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); + build_opts.add_option("-DINPUT3_ROTATE_N=" + support::cpp11::to_string((src1->dimension(0) + src2->dimension(0) + + src3->dimension(0) - vec_size_leftover) % + num_elems_processed_per_iteration)); _depth = src1->dimension(2); _input1_width = src1->dimension(0); @@ 
-106,8 +123,9 @@ void ClWidthConcatenate4TensorsKernel::configure(const CLCompileContext &compile _input3_width = src3->dimension(0); // If sources have different quantization info, set quantization parameters needed for the re-quantization process - const bool have_different_qinfo = helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); - if(is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) + const bool have_different_qinfo = + helpers::tensor_info::tensors_have_different_quantization_info(dst, src1, src2, src3, src4); + if (is_data_type_quantized_asymmetric(src1->data_type()) && have_different_qinfo) { const UniformQuantizationInfo iq1_info = src1->quantization_info().uniform(); const UniformQuantizationInfo iq2_info = src2->quantization_info().uniform(); @@ -166,11 +184,15 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src0 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); - const auto src1 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); - const auto src2 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); - const auto src3 = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src0 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC)); + const auto src1 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 1)); + const auto src2 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 2)); + const auto src3 = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_VEC + 3)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); Window slice = window.first_slice_window_4D(); @@ -187,8 +209,7 @@ void ClWidthConcatenate4TensorsKernel::run_op(ITensorPack &tensors, const Window _kernel.setArg<cl_int>(idx++, _input2_width); _kernel.setArg<cl_int>(idx++, _input3_width); enqueue(queue, *this, window, lws_hint()); - } - while(window.slide_window_slice_4D(slice)); + } while (window.slide_window_slice_4D(slice)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h index baf8d381be..f589b8ac1a 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h @@ -52,23 +52,32 @@ public: * @param[in] src4 Fourth source tensor info. Data types supported: same as @p src1 * @param[out] dst Destination tensor info. Data types supported: same as @p src1. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *src3, ITensorInfo *src4, ITensorInfo *dst); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *src3, + ITensorInfo *src4, + ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenate4TensorsKernel::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *src3, const ITensorInfo *src4, const ITensorInfo *dst); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *src3, + const ITensorInfo *src4, + const ITensorInfo *dst); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; - int32_t _input1_width{ 0 }; - int32_t _input2_width{ 0 }; - int32_t _input3_width{ 0 }; + int32_t _depth{0}; + int32_t _input1_width{0}; + int32_t _input2_width{0}; + int32_t _input3_width{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp index 2dfe7fce52..989de4a7b7 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.cpp @@ -30,10 +30,10 @@ #include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/helpers/AdjustVecSize.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/WindowHelpers.h" #include "support/Cast.h" - #include "support/StringSupport.h" namespace arm_compute @@ -53,7 +53,7 @@ Status validate_arguments(const ITensorInfo *src, unsigned int width_offset, con ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, dst); ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(0) + width_offset > dst->dimension(0)); - for(size_t i = 1; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = 1; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON(src->dimension(i) != dst->dimension(i)); } @@ -74,12 +74,15 @@ Status ClWidthConcatenateKernel::validate(const ITensorInfo *src, unsigned int w return Status{}; } -void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst) +void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + unsigned int width_offset, + ITensorInfo *dst) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, width_offset, dst)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const unsigned int num_elems_processed_per_iteration = adjust_vec_size(16, src->dimension(0)); @@ -87,10 +90,11 @@ void ClWidthConcatenateKernel::configure(const CLCompileContext &compile_context CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); - build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); + build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + + support::cpp11::to_string(src->dimension(0) % num_elems_processed_per_iteration)); 
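These concatenation kernels also emit offset/scale build options so quantized inputs can be re-quantized on the fly when their UniformQuantizationInfo differs from the destination's, as the branches above and below show. A sketch of the standard asymmetric 8-bit mapping that implies; this is not the library's kernel code:

#include <algorithm>
#include <cmath>
#include <cstdint>
#include <cstdio>

// Hedged sketch: dequantize with the input's (scale, offset), requantize with
// the output's, clamp to the 8-bit range.
static uint8_t requantize_u8(uint8_t q, float in_scale, int32_t in_offset,
                             float out_scale, int32_t out_offset)
{
    const float   real = in_scale * (static_cast<int32_t>(q) - in_offset);
    const int32_t q2   = static_cast<int32_t>(std::lround(real / out_scale)) + out_offset;
    return static_cast<uint8_t>(std::clamp<int32_t>(q2, 0, 255));
}

int main()
{
    // 35 under (scale 0.02, offset 10) encodes 0.5; under (0.05, 0) that is 10.
    std::printf("%u\n", requantize_u8(35, 0.02f, 10, 0.05f, 0));
    return 0;
}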
build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(width_offset)); - if(is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) + if (is_data_type_quantized_asymmetric(src->data_type()) && src->quantization_info() != dst->quantization_info()) { const UniformQuantizationInfo iqinfo = src->quantization_info().uniform(); const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform(); @@ -121,8 +125,9 @@ void ClWidthConcatenateKernel::run_op(ITensorPack &tensors, const Window &window ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - const auto src = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); - auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); + const auto src = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC)); + auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST)); unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); diff --git a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h index 3ace4400e6..c10d6a4dc6 100644 --- a/src/gpu/cl/kernels/ClWidthConcatenateKernel.h +++ b/src/gpu/cl/kernels/ClWidthConcatenateKernel.h @@ -50,7 +50,8 @@ public: * @param[in,out] dst Destination tensor info. Data types supported: same as @p src. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, unsigned int width_offset, ITensorInfo *dst); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClWidthConcatenateKernel::configure() @@ -63,7 +64,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, ::cl::CommandQueue &queue) override; private: - int32_t _depth{ 0 }; + int32_t _depth{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp index 7148a4c85c..58c01d4da5 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.cpp @@ -29,10 +29,11 @@ #include "arm_compute/core/Helpers.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" +#include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" #include "arm_compute/core/Validate.h" #include "arm_compute/core/Window.h" -#include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/WindowHelpers.h" @@ -60,14 +61,18 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const size_t idx_w = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::WIDTH); const size_t idx_h = get_data_layout_dimension_index(input->data_layout(), DataLayoutDimension::HEIGHT); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd filter transform not supported"); - ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || input->dimension(idx_h) != kernel_size.height); 
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd filter transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(idx_w) != kernel_size.width || + input->dimension(idx_h) != kernel_size.height); ARM_COMPUTE_RETURN_ERROR_ON(input->num_dimensions() > 4); // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -81,11 +86,15 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(output); - const unsigned int num_elems_processed_per_iteration_x = input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; + const unsigned int num_elems_processed_per_iteration_x = + input->data_layout() == DataLayout::NCHW ? input->dimension(0) : 1; const unsigned int num_elems_processed_per_iteration_y = input->dimension(1); - const unsigned int num_elems_read_per_iteration_z = input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); + const unsigned int num_elems_read_per_iteration_z = + input->data_layout() == DataLayout::NCHW ? 1 : input->dimension(2); - Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, num_elems_read_per_iteration_z)); + Window win = + calculate_max_window(*input, Steps(num_elems_processed_per_iteration_x, num_elems_processed_per_iteration_y, + num_elems_read_per_iteration_z)); Window win_collapsed = win.collapse(win, Window::DimZ); return std::make_pair(Status{}, win_collapsed); } @@ -96,21 +105,25 @@ ClWinogradFilterTransformKernel::ClWinogradFilterTransformKernel() { _type = CLKernelType::WINOGRAD; } -void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_filter_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); // Set build options CLBuildOptions build_opts; // For NHWC layouts pass tensor dimensions at runtime - if(src->data_layout() == DataLayout::NHWC) + if (src->data_layout() == DataLayout::NHWC) { _src_dim_z = src->dimension(2); } @@ -125,7 +138,8 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ const Size2D output_tile_size = winograd_info.output_tile_size; // Create kernel - std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + 
kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); + std::string kernel_name = "winograd_filter_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(src->data_layout())); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -138,7 +152,9 @@ void ClWinogradFilterTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info)); } -Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradFilterTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get()).first); @@ -161,7 +177,7 @@ void ClWinogradFilterTransformKernel::run_op(ITensorPack &tensors, const Window unsigned int idx = 0; add_4D_tensor_argument(idx, src, window); add_3D_tensor_argument(idx, dst, window_out); - if(src->info()->data_layout() == DataLayout::NHWC) + if (src->info()->data_layout() == DataLayout::NHWC) { _kernel.setArg<cl_uint>(idx++, _src_dim_z); } diff --git a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h index b2130304e6..6e439f0c99 100644 --- a/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_FILTER_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[out] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_filter_transform_shape. 
Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradFilterTransformKernel::configure() @@ -72,7 +76,7 @@ public: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; private: - int32_t _src_dim_z{ 0 }; + int32_t _src_dim_z{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp index fab6c36032..54c48986fc 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.cpp @@ -32,6 +32,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -55,17 +56,21 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, "Winograd input transform only supports unit strides"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), "Winograd input transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv_info.stride().first != 1 || conv_info.stride().second != 1, + "Winograd input transform only supports unit strides"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, input->data_layout()), + "Winograd input transform not supported"); ARM_COMPUTE_UNUSED(conv_info); ARM_COMPUTE_UNUSED(output_tile_size); ARM_COMPUTE_UNUSED(kernel_size); // Validate configured output - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*input, winograd_info); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DIMENSIONS(output->tensor_shape(), output_shape); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -74,7 +79,8 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output, c return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) +std::pair<Status, Window> +validate_and_configure_window(ITensorInfo *input, ITensorInfo *output, const WinogradInfo &winograd_info) { ARM_COMPUTE_UNUSED(output); ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); @@ -82,7 +88,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen bool window_changed = false; int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + 
if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. @@ -94,9 +100,9 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const size_t dim0 = input->dimension(0); const size_t k_sz = winograd_info.kernel_size.area(); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { - if(k_sz == 3 || k_sz == 9) + if (k_sz == 3 || k_sz == 9) { num_elems_processed_per_iteration = 2; } @@ -104,7 +110,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen } Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); - if(input->data_layout() == DataLayout::NCHW) + if (input->data_layout() == DataLayout::NCHW) { const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -113,11 +119,13 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen unsigned int num_elems_read_per_iteration_x = output_tile_size.width + kernel_size.width - 1; unsigned int num_elems_read_per_iteration_y = output_tile_size.height + kernel_size.height - 1; - AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); + AccessWindowRectangle input_access(input, -conv_info.pad_left(), -conv_info.pad_top(), + num_elems_read_per_iteration_x, num_elems_read_per_iteration_y); window_changed = update_window_and_padding(win, input_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -132,12 +140,15 @@ BorderSize ClWinogradInputTransformKernel::border_size() const return _border_size; } -void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info) +void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, dst, winograd_info)); - auto padding_info = get_padding_info({ src, dst }); + auto padding_info = get_padding_info({src, dst}); const PadStrideInfo conv_info = winograd_info.convolution_info; const Size2D output_tile_size = winograd_info.output_tile_size; @@ -150,14 +161,13 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c // Compute the number of output tiles along the x and y direction of size "output_tile_size" const Size2D num_tiles = compute_winograd_convolution_tiles(Size2D(src->dimension(idx_w), src->dimension(idx_h)), - kernel_size, - output_tile_size, - conv_info); + kernel_size, output_tile_size, conv_info); _num_tiles_x = num_tiles.width; _num_tiles_y = num_tiles.height; - const TensorShape output_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorShape output_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); // Output auto initialization if not yet initialized auto_init_if_empty(*dst, src->clone()->set_tensor_shape(output_shape)); @@ -174,7 +184,7 @@ void 
ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _src_height = src->dimension(idx_h); CLBuildOptions build_opts; - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { build_opts.add_option("-DNHWC"); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -201,13 +211,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c } // Create kernel - std::string kernel_name = "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); + std::string kernel_name = + "winograd_input_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string(); // Get the maximum dimension from the tile size const unsigned int tile_max_dim = std::max(output_tile_size.width, output_tile_size.height); // Check optimized kernel if output_dims == 2x2 - if((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) + if ((tile_max_dim == 2) && (_data_layout == DataLayout::NCHW)) { _step_z = (src->dimension(2) % 2) != 0 ? 1 : 2; } @@ -239,11 +250,14 @@ void ClWinogradInputTransformKernel::configure(const ClCompileContext &compile_c _config_id += lower_string(string_from_data_layout(_data_layout)); } -Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info) +Status ClWinogradInputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const WinogradInfo &winograd_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, dst, winograd_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_and_configure_window(src->clone().get(), dst->clone().get(), winograd_info).first); return Status{}; } @@ -263,7 +277,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & // Collapse window Window window_collapsed = window.collapse_if_possible(IClKernel::window(), Window::DimZ); - if(_data_layout == DataLayout::NHWC) + if (_data_layout == DataLayout::NHWC) { Window slice = window_collapsed.first_slice_window_3D(); slice.set(1, Window::Dimension(0, _num_tiles_x * _num_tiles_y, 1)); @@ -298,8 +312,7 @@ void ClWinogradInputTransformKernel::run_op(ITensorPack &tensors, const Window & add_3D_tensor_argument(idx, dst, slice); enqueue(queue, *this, slice, lws_hint()); - } - while(window_collapsed.slide_window_slice_3D(slice)); + } while (window_collapsed.slide_window_slice_3D(slice)); } } } // namespace kernels diff --git a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h index c10c528b9b..cebebea1d3 100644 --- a/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradInputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_INPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -59,7 +60,10 @@ public: * @param[in] dst The output tensor info. The shape for this tensor can be calculated using the utility function @p compute_winograd_input_transform_shape. Data types supported: Same as @p input * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const WinogradInfo &winograd_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const WinogradInfo &winograd_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradInputTransformKernel::configure() @@ -69,19 +73,19 @@ public: static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const WinogradInfo &winograd_info); // Inherited methods overridden: - void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; + void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; BorderSize border_size() const override; private: using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>; - BorderSize _border_size{ 0 }; - DataLayout _data_layout{ DataLayout::UNKNOWN }; - int _num_tiles_x{ 0 }; - int _num_tiles_y{ 0 }; - unsigned int _step_z{ 1 }; - int32_t _src_width{ 0 }; - int32_t _src_height{ 0 }; + BorderSize _border_size{0}; + DataLayout _data_layout{DataLayout::UNKNOWN}; + int _num_tiles_x{0}; + int _num_tiles_y{0}; + unsigned int _step_z{1}; + int32_t _src_width{0}; + int32_t _src_height{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp index bf974d30d8..89c80c55ef 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.cpp @@ -23,7 +23,6 @@ */ #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" -#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/CL/ICLTensor.h" @@ -31,10 +30,12 @@ #include "arm_compute/core/IAccessWindow.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" -#include "arm_compute/core/Validate.h" -#include "arm_compute/core/Window.h" +#include "arm_compute/core/utils/ActivationFunctionUtils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/StringUtils.h" +#include "arm_compute/core/Validate.h" +#include "arm_compute/core/Window.h" + #include "src/core/AccessWindowStatic.h" #include "src/core/CL/CLValidate.h" #include "src/core/helpers/AutoConfiguration.h" @@ -54,7 +55,11 @@ namespace kernels { namespace { -Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, const ITensorInfo *output, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status validate_arguments(const ITensorInfo *input, + const ITensorInfo *bias, + const ITensorInfo *output, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_UNUSED(act_info); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::F32, DataType::F16); @@ -66,30 +71,32 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con const Size2D output_tile_size = winograd_info.output_tile_size; const Size2D kernel_size = winograd_info.kernel_size; const Size2D input_dimensions = winograd_info.input_dimensions; - const unsigned int num_channels = (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); + const unsigned int num_channels 
= (winograd_info.kernel_size.width + winograd_info.output_tile_size.width - 1) * + (winograd_info.kernel_size.height + winograd_info.output_tile_size.height - 1); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), "Winograd output transform not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !cl_winograd_convolution_layer_supported(output_tile_size, kernel_size, winograd_info.output_data_layout), + "Winograd output transform not supported"); ARM_COMPUTE_RETURN_ERROR_ON_MSG(input->dimension(2) != num_channels, "Wrong number of channels"); // Compute number of elements to process in the X and Y direction // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(1) != static_cast<unsigned int>((num_tiles.area()))); - if(bias != nullptr) + if (bias != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, bias); ARM_COMPUTE_RETURN_ERROR_ON(input->dimension(0) != bias->dimension(0)); } // Checks performed when output is configured - if(output->total_size() != 0) + if (output->total_size() != 0) { - const TensorInfo tensor_info_output = input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); + const TensorInfo tensor_info_output = + input->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*input, winograd_info)); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(output, &tensor_info_output); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(input, output); @@ -98,14 +105,17 @@ Status validate_arguments(const ITensorInfo *input, const ITensorInfo *bias, con return Status{}; } -std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITensorInfo *bias, ITensorInfo *output, const Size2D &output_tile_size) +std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, + ITensorInfo *bias, + ITensorInfo *output, + const Size2D &output_tile_size) { ARM_COMPUTE_ERROR_ON_NULLPTR(input, output); ARM_COMPUTE_UNUSED(bias); unsigned int num_elems_processed_per_iteration = 1; - if(input->data_layout() == DataLayout::NHWC) + if (input->data_layout() == DataLayout::NHWC) { // In the case of FP16 computation, we can perform more // output feature maps in a single work-item. 
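Both the input and output transforms size their work from the same tile count. With the unit strides Winograd requires, this is a sketch of the arithmetic assumed to sit behind compute_winograd_convolution_tiles(); verify against the library before relying on it:

#include <cmath>
#include <cstdio>

// Hedged sketch: convolution output extent along one axis at stride 1 is
// in + pad_begin + pad_end - (kernel - 1), covered by ceil(extent / tile) tiles.
static unsigned int winograd_num_tiles(unsigned int in, unsigned int kernel,
                                       unsigned int tile,
                                       unsigned int pad_begin, unsigned int pad_end)
{
    const unsigned int extent = in + pad_begin + pad_end - (kernel - 1);
    return static_cast<unsigned int>(std::ceil(extent / static_cast<float>(tile)));
}

int main()
{
    // Example: width 224, 3x3 kernel, 4x4 output tiles, SAME padding (1, 1) -> 56 tiles.
    std::printf("%u tiles\n", winograd_num_tiles(224, 3, 4, 1, 1));
    return 0;
}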
@@ -115,7 +125,7 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen const DataType dt = input->data_type(); const size_t dim0 = input->dimension(0); const bool cond = dt == DataType::F16 && ((dim0 % 2) == 0); - if(cond) + if (cond) { num_elems_processed_per_iteration = 2; } @@ -124,17 +134,19 @@ std::pair<Status, Window> validate_and_configure_window(ITensorInfo *input, ITen Window win = calculate_max_window(*input, Steps(num_elems_processed_per_iteration)); bool window_changed = false; - if(output->data_layout() == DataLayout::NCHW) + if (output->data_layout() == DataLayout::NCHW) { const int output_static_window_end_x = ceil_to_multiple(output->dimension(0), output_tile_size.width); const int output_static_window_end_y = ceil_to_multiple(output->dimension(1), output_tile_size.height); - AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, num_elems_processed_per_iteration); + AccessWindowRectangle input_access(input, 0, 0, num_elems_processed_per_iteration, + num_elems_processed_per_iteration); AccessWindowStatic output_access(output, 0, 0, output_static_window_end_x, output_static_window_end_y); window_changed = update_window_and_padding(win, input_access, output_access); } - Status err = (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; + Status err = + (window_changed) ? ARM_COMPUTE_CREATE_ERROR(ErrorCode::RUNTIME_ERROR, "Insufficient Padding!") : Status{}; return std::make_pair(err, win); } } // namespace @@ -144,13 +156,18 @@ ClWinogradOutputTransformKernel::ClWinogradOutputTransformKernel() _type = CLKernelType::WINOGRAD; } -void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, +void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); // Output tensor auto initialization if not yet initialized - auto_init_if_empty(*dst, src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); + auto_init_if_empty(*dst, + src->clone()->set_tensor_shape(compute_winograd_output_transform_shape(*src, winograd_info))); ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, bias, dst, winograd_info, act_info)); @@ -159,7 +176,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ ARM_COMPUTE_ERROR_THROW_ON(win_config.first); IClKernel::configure_internal(win_config.second); - auto padding_info = get_padding_info({ src, bias, dst }); + auto padding_info = get_padding_info({src, bias, dst}); _is_nhwc = winograd_info.output_data_layout == DataLayout::NHWC; @@ -168,14 +185,13 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const Size2D kernel_size = winograd_info.kernel_size; const Size2D output_tile_size = winograd_info.output_tile_size; const PadStrideInfo conv_info = winograd_info.convolution_info; - const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); - const int idx_height = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); + const int idx_width = get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::WIDTH); + const 
int idx_height = + get_data_layout_dimension_index(winograd_info.output_data_layout, DataLayoutDimension::HEIGHT); // Compute the number of output tiles along the x and y direction of size "output_tile_size" - const Size2D num_tiles = compute_winograd_convolution_tiles(input_dimensions, - kernel_size, - output_tile_size, - conv_info); + const Size2D num_tiles = + compute_winograd_convolution_tiles(input_dimensions, kernel_size, output_tile_size, conv_info); const size_t total_batches = dst->tensor_shape().total_size_upper(3); // Set build options @@ -184,11 +200,11 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option_if(act_info.enabled(), "-DA_VAL=" + float_to_string_with_full_precision(act_info.a())); build_opts.add_option_if(act_info.enabled(), "-DB_VAL=" + float_to_string_with_full_precision(act_info.b())); - if((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) + if ((output_tile_size.x() == 2) || (output_tile_size.x() == 1 && output_tile_size.y() == 2)) { build_opts.add_option("-DVEC_SIZE=2"); } - else if((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) + else if ((output_tile_size.x() == 4) || (output_tile_size.x() == 1 && output_tile_size.y() == 4)) { build_opts.add_option("-DVEC_SIZE=4"); } @@ -200,9 +216,10 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ const auto act_function = act_info.activation(); const auto src_data_type = src->data_type(); - if((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) - && (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) - && (src_data_type == DataType::F32 || src_data_type == DataType::F16)) + if ((gpu_target != GPUTarget::G71 && (gpu_target & GPUTarget::GPU_ARCH_MASK) == GPUTarget::BIFROST) && + (act_function == ActivationLayerInfo::ActivationFunction::BOUNDED_RELU || + act_function == ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU) && + (src_data_type == DataType::F32 || src_data_type == DataType::F16)) { // -cl-fast-relaxed-math also sets -cl-finite-math-only and -cl-unsafe-math-optimizations // to disable -cl-finite-math-only, we only include -cl-unsafe-math-optimizations @@ -213,7 +230,7 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ build_opts.add_option("-cl-fast-relaxed-math"); } - if(_is_nhwc) + if (_is_nhwc) { build_opts.add_option_if(bias != nullptr, std::string("-DHAS_BIAS")); build_opts.add_option("-DN0=" + support::cpp11::to_string(win_config.second.x().step())); @@ -247,7 +264,9 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ _dst_height = dst->dimension(idx_height); // Create kernel - std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + kernel_size.to_string() + "_" + lower_string(string_from_data_layout(winograd_info.output_data_layout)); + std::string kernel_name = "winograd_output_transform_" + output_tile_size.to_string() + "_" + + kernel_size.to_string() + "_" + + lower_string(string_from_data_layout(winograd_info.output_data_layout)); // A macro guard to compile ONLY the kernel of interest build_opts.add_option("-D" + upper_string(kernel_name)); @@ -271,10 +290,18 @@ void ClWinogradOutputTransformKernel::configure(const ClCompileContext &compile_ 
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info) && _is_nhwc); } -Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info) +Status ClWinogradOutputTransformKernel::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info) { - ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); - ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), (bias != nullptr ? bias->clone().get() : nullptr), dst->clone().get(), winograd_info.output_tile_size).first); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_arguments(src, (bias != nullptr ? bias->clone().get() : nullptr), dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_and_configure_window(src->clone().get(), + (bias != nullptr ? bias->clone().get() : nullptr), + dst->clone().get(), winograd_info.output_tile_size) + .first); return Status{}; } @@ -299,7 +326,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window slice_out.set(Window::DimX, Window::Dimension(0, 0, 0)); slice_out.set(Window::DimY, Window::Dimension(0, 0, 0)); - if(bias != nullptr) + if (bias != nullptr) { unsigned int idx1 = 2 * num_arguments_per_4D_tensor(); Window slice_biases; @@ -307,7 +334,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_1D_tensor_argument(idx1, bias, slice_biases); } - if(_is_nhwc) + if (_is_nhwc) { unsigned int idx2 = 2 * num_arguments_per_4D_tensor() + ((bias != nullptr) ? num_arguments_per_1D_tensor() : 0); _kernel.setArg(idx2++, static_cast<int>(dst->info()->total_size() - dst->info()->strides_in_bytes().y())); @@ -322,8 +349,7 @@ void ClWinogradOutputTransformKernel::run_op(ITensorPack &tensors, const Window add_4D_tensor_argument(idx, src, slice); add_4D_tensor_argument(idx, dst, slice_out); enqueue(queue, *this, slice, lws_hint()); - } - while(window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); + } while (window.slide_window_slice_3D(slice) && window.slide_window_slice_3D(slice_out)); } } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h index 6f018967d0..65bb963061 100644 --- a/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h +++ b/src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRAD_OUTPUT_TRANSFORM_KERNEL_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/core/common/Macros.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" @@ -61,7 +62,11 @@ public: * @param[in] winograd_info Contains Winograd's information described in @ref WinogradInfo * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *bias, ITensorInfo *dst, const WinogradInfo &winograd_info, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *bias, + ITensorInfo *dst, + const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -69,7 +74,11 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const WinogradInfo &winograd_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const WinogradInfo &winograd_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue) override; @@ -77,11 +86,11 @@ public: private: using WinogradKey = std::pair<std::pair<int, int>, std::pair<int, int>>; - bool _is_nhwc{ false }; - int32_t _src_height{ 0 }; - int32_t _dst_width{ 0 }; - int32_t _dst_height{ 0 }; - int32_t _num_tiles_x{ 0 }; + bool _is_nhwc{false}; + int32_t _src_height{0}; + int32_t _dst_width{0}; + int32_t _dst_height{0}; + int32_t _num_tiles_x{0}; }; } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp index 9350bf74bb..b5ebac3b49 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.cpp @@ -39,14 +39,24 @@ namespace kernels { namespace gemm { -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image) { ARM_COMPUTE_ERROR_ON(m0 == 0 || n0 == 0); ARM_COMPUTE_ERROR_ON(v0 == 0); v0 = std::max(std::min(static_cast<int>(m / m0), static_cast<int>(v0)), static_cast<int>(1)); - if(h0 == 0) + if (h0 == 0) { // When h0 is 0, we should take the maximum H0 possible h0 = std::max(n / n0, 1U); @@ -62,17 +72,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned return std::make_pair(lhs_info, rhs_info); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type) { - ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, "The fallback GeMM configuration cannot have export_to_cl_image = true"); + ARM_COMPUTE_ERROR_ON_MSG(info_buf.second.export_to_cl_image == true, + "The fallback GeMM configuration 
cannot have export_to_cl_image = true"); const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, data_type); const TensorShape shape = misc::shape_calculator::compute_rhs_reshaped_shape(tensor_rhs_info, info_img.second); const TensorInfo tensor_reshaped_info(shape, 1, data_type); - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, info_img.second))) { return info_img; } @@ -90,42 +105,56 @@ void update_padding_for_cl_image(ITensorInfo *tensor) const unsigned int pixel_alignment = get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()); ARM_COMPUTE_ERROR_ON_MSG(pixel_alignment == 0, "Cannot retrieve cl_image pitch alignment"); - if(pixel_alignment == 0) + if (pixel_alignment == 0) { return; } const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; - const unsigned int round_up_width = ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; - const unsigned int padding = round_up_width - stride_y_in_elements; + const unsigned int round_up_width = + ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; + const unsigned int padding = round_up_width - stride_y_in_elements; tensor->extend_padding(PaddingSize(0, tensor->padding().right + padding, 0, 0)); } Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, const GEMMRHSMatrixInfo &rhs_info) { - if(rhs_info.export_to_cl_image) + if (rhs_info.export_to_cl_image) { - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, "Export to cl_image only supported with n0 = 4, 8 or 16"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, "Export to cl_image only supported with k0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.n0 == 2) || (rhs_info.n0 == 3)) && rhs_info.transpose == false, + "Export to cl_image only supported with n0 = 4, 8 or 16"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(((rhs_info.k0 == 2) || (rhs_info.k0 == 3)) && rhs_info.transpose == true, + "Export to cl_image only supported with k0 = 4, 8 or 16"); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN(&tensor_reshaped_info, DataType::F32, DataType::F16); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(!image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, "Impossible to retrieve the cl_image pitch alignment"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + !image2d_from_buffer_supported(CLKernelLibrary::get().get_device()), + "The extension cl_khr_image2d_from_buffer is not supported on the target platform"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(get_cl_image_pitch_alignment(CLKernelLibrary::get().get_device()) == 0, + "Impossible to retrieve the cl_image pitch alignment"); // Check the width and height of the output tensor. 
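
The padding computed in update_padding_for_cl_image() above rounds the row stride up to the device's image pitch alignment. A small worked sketch of that round-up with illustrative numbers (the 64-pixel alignment is an assumption, not a queried device value):

#include <iostream>

int main()
{
    // Illustrative inputs; in the library they come from the tensor strides
    // and from get_cl_image_pitch_alignment() on the target device.
    const unsigned int stride_y_in_elements = 1000; // elements per row
    const unsigned int pixel_alignment      = 64;   // assumed pitch alignment, in pixels
    const unsigned int num_floats_per_pixel = 4;    // one RGBA float pixel

    const unsigned int row_pitch_alignment = pixel_alignment * num_floats_per_pixel; // 256 elements
    const unsigned int round_up_width =
        ((stride_y_in_elements + row_pitch_alignment - 1) / row_pitch_alignment) * row_pitch_alignment; // 1024
    const unsigned int padding = round_up_width - stride_y_in_elements; // 24 elements of right padding

    std::cout << "padding = " << padding << " elements\n";
}
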
// Since we cannot create a 3d image from a buffer, the third dimension is collapsed on the second dimension const size_t max_image_w = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_WIDTH>(); const size_t max_image_h = CLKernelLibrary::get().get_device().getInfo<CL_DEVICE_IMAGE2D_MAX_HEIGHT>(); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, "Not supported width for cl_image"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, "Not supported height for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(tensor_reshaped_info.tensor_shape()[0] > max_image_w * 4, + "Not supported width for cl_image"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + tensor_reshaped_info.tensor_shape()[1] * tensor_reshaped_info.tensor_shape()[2] > max_image_h, + "Not supported height for cl_image"); } return Status{}; } -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0) +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0) { ARM_COMPUTE_UNUSED(n, k, b, data_type); @@ -141,7 +170,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const return ((k % mmul_k0) == 0) && (gws_y > 4); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b) { size_t min_acc = std::numeric_limits<size_t>::max(); size_t min_idx = 0; @@ -150,12 +180,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf const size_t num_rows = configs.size(); const size_t num_cols = configs[0].size(); - ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, N0. K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); + ARM_COMPUTE_ERROR_ON_MSG(num_cols != 14U, "The entry should have 14 integer values representing: M, N, K, B, M0, " + "N0. 
K0, V0, H0, INT_LHS, INT_RHS, TRA_LHS, TRA_RHS, IMG_RHS"); ARM_COMPUTE_UNUSED(num_cols); // Find nearest GeMM workload // Note: the workload does not depend on the K dimension - for(size_t y = 0; y < num_rows; ++y) + for (size_t y = 0; y < num_rows; ++y) { size_t mc0 = static_cast<size_t>(configs[y][0]); size_t nc0 = static_cast<size_t>(configs[y][1]); @@ -168,7 +199,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConf acc += (k - kc0) * (k - kc0); acc += (b - bc0) * (b - bc0); acc = std::sqrt(acc); - if(acc < min_acc) + if (acc < min_acc) { min_acc = acc; min_idx = y; diff --git a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h index 6689b10e69..84776fb207 100644 --- a/src/gpu/cl/kernels/gemm/ClGemmHelpers.h +++ b/src/gpu/cl/kernels/gemm/ClGemmHelpers.h @@ -54,8 +54,18 @@ using GeMMConfigsMatrix = std::vector<std::vector<int32_t>>; * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, unsigned int n, unsigned int m0, unsigned int n0, unsigned int k0, unsigned int v0, unsigned int h0, - bool lhs_interleave, bool rhs_interleave, bool lhs_transpose, bool rhs_transpose, bool export_to_cl_image = false); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned int m, + unsigned int n, + unsigned int m0, + unsigned int n0, + unsigned int k0, + unsigned int v0, + unsigned int h0, + bool lhs_interleave, + bool rhs_interleave, + bool lhs_transpose, + bool rhs_transpose, + bool export_to_cl_image = false); /** Select @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo * @@ -72,9 +82,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_lhs_rhs_info(unsigned * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, - unsigned int n, unsigned int k, unsigned int b, DataType data_type); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +select_lhs_rhs_info(std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_img, + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> info_buf, + unsigned int n, + unsigned int k, + unsigned int b, + DataType data_type); /** Update padding required to export the OpenCL buffer to OpenCL image2d * @@ -103,8 +117,13 @@ Status validate_image2d_support_on_rhs(const ITensorInfo &tensor_reshaped_info, * * @return true if MMUL kernel is preferred over kernels w/o MMUL, false otherwise */ -bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const unsigned int k, const unsigned int b, - const DataType data_type, unsigned int &best_m0, unsigned int &best_n0); +bool is_mmul_kernel_preferred(const unsigned int m, + const unsigned int n, + const unsigned int k, + const unsigned int b, + const DataType data_type, + unsigned int &best_m0, + unsigned int &best_n0); /** Find the preferred configurations for the LHS and RHS tensor using the GeMMConfigsMatrix provided by the user * @@ -116,7 +135,8 @@ bool is_mmul_kernel_preferred(const unsigned int m, const unsigned int n, const * * @return @ref GEMMLHSMatrixInfo and @ref GEMMRHSMatrixInfo */ -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> find_lhs_rhs_info(const GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +find_lhs_rhs_info(const 
GeMMConfigsMatrix &configs, unsigned int m, unsigned int n, unsigned int k, unsigned int b); } // namespace gemm } // namespace kernels } // namespace opencl diff --git a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h index a49836cfda..9d08633963 100644 --- a/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/IClGemmKernelConfig.h @@ -26,6 +26,7 @@ #include "arm_compute/core/GPUTarget.h" #include "arm_compute/core/Types.h" + #include "src/core/common/Macros.h" #include <array> @@ -56,8 +57,7 @@ public: * @param[in] func_int8 Function to call for GEMM Int8 (QASYMM8, QASYMM8_SIGNED, QSYMM8_PER_CHANNEL) * */ - CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) - : _configs{ func_f32, func_f16, func_int8 } + CLGEMMConfigArray(T func_f32, T func_f16, T func_int8) : _configs{func_f32, func_f16, func_int8} { } @@ -69,7 +69,7 @@ public: */ T get_function(DataType data_type) { - switch(data_type) + switch (data_type) { case DataType::F32: return _configs.at(DT_F32); @@ -96,8 +96,7 @@ public: * * @param[in] arch GPU target */ - IClGemmKernelConfig(GPUTarget arch) - : _target(arch) + IClGemmKernelConfig(GPUTarget arch) : _target(arch) { } ARM_COMPUTE_DISALLOW_COPY_ALLOW_MOVE(IClGemmKernelConfig); @@ -111,7 +110,8 @@ public: * @param[in] b Batch size * @param[in] data_type Data type */ - virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; + virtual std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) = 0; protected: GPUTarget _target; diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp index d74c7fac9b..2f37eef31f 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,31 +39,34 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeBifrost::ClGemmDefaultConfigNativeBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71(&ClGemmDefaultConfigNativeBifrost::configure_G71_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic - 
&ClGemmDefaultConfigNativeBifrost::configure_G71_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G71( + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G71_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G71_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigNativeBifrost::configure_G76_f32, - &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, + &ClGemmDefaultConfigNativeBifrost::configure_G76_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_G76_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigNativeBifrost::configure_default_f32, - &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic - &ClGemmDefaultConfigNativeBifrost::configure_default_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, + &ClGemmDefaultConfigNativeBifrost::configure_default_f32, // We use the F32 heuristic + &ClGemmDefaultConfigNativeBifrost::configure_default_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -79,18 +83,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -105,20 +110,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -129,7 +135,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -141,9 +147,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigNativeBifrost } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -159,24 +165,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 4196) + if (n > 4196) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 1, false, false, false, false); } else { - if(k < 2048) + if (k < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 1, false, false, false, false); } - else if(k >= 2048 && k < 16384) + else if (k >= 2048 && k < 16384) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } @@ -192,18 +199,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -214,7 +222,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -225,7 +233,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -233,7 +242,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 1, false, false, false, false); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeBifrost::configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -243,4 +253,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeBifrost } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
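
The native and reshaped config classes in this diff all share the dispatch pattern introduced by CLGEMMConfigArray and IClGemmKernelConfig: a per-data-type array of member-function pointers, selected per GPU target and then invoked through this->*. A condensed, self-contained sketch of that pattern (Heuristic and its block-size values are invented for illustration):

#include <array>
#include <cstddef>
#include <iostream>
#include <stdexcept>
#include <utility>

enum class DataType { F32, F16, Int8 };

class Heuristic
{
public:
    using ConfigFn = std::pair<int, int> (Heuristic::*)(unsigned int m, unsigned int n);

    std::pair<int, int> configure(unsigned int m, unsigned int n, DataType dt)
    {
        // One slot per data type, as in CLGEMMConfigArray; a slot may be
        // nullptr when a backend has no heuristic for that type (cf. Midgard).
        const std::array<ConfigFn, 3> table{&Heuristic::f32_cfg, &Heuristic::f32_cfg /* F16 reuses F32 */,
                                            &Heuristic::q8_cfg};
        const ConfigFn fn = table[static_cast<std::size_t>(dt)];
        if (fn == nullptr)
        {
            throw std::runtime_error("Data type not supported for GEMM");
        }
        return (this->*fn)(m, n); // dispatch through the member-function pointer
    }

private:
    // Toy block-size choices standing in for the configure_*_f32 / configure_*_u8 members.
    std::pair<int, int> f32_cfg(unsigned int m, unsigned int n) { return {m < 64 ? 4 : 5, n <= 4 ? 2 : 4}; }
    std::pair<int, int> q8_cfg(unsigned int m, unsigned int n) { return {m < 64 ? 2 : 4, n <= 4 ? 8 : 16}; }
};

int main()
{
    Heuristic h;
    const auto [m0, n0] = h.configure(128, 8, DataType::F32);
    std::cout << "m0=" << m0 << " n0=" << n0 << "\n"; // m0=5 n0=4
}
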
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h index 9af5dc4135..f822daae53 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeBifrost.h @@ -45,15 +45,22 @@ public: ClGemmDefaultConfigNativeBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G71_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G71_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_default_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_default_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp index b9f36c7210..f87fb1b659 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,18 +39,17 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeMidgard::ClGemmDefaultConfigNativeMidgard(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigNativeMidgard::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeMidgard::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, - nullptr, + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(nullptr, nullptr, &ClGemmDefaultConfigNativeMidgard::default_q8); auto func = configs_default.get_function(data_type); @@ -57,7 +57,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeMidgard::default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -70,4 +71,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeMidgard } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
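
find_lhs_rhs_info() in ClGemmHelpers.cpp earlier in this diff picks a row from the user-supplied configuration matrix by minimising the Euclidean distance between the requested (M, N, K, B) workload and each row's reference workload. A self-contained sketch of that search, reduced to the four workload columns (the real rows carry 14 values, including the block sizes to return):

#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

static std::size_t nearest_config(const std::vector<std::vector<int>> &configs, int m, int n, int k, int b)
{
    double      min_acc = std::numeric_limits<double>::max();
    std::size_t min_idx = 0;

    for (std::size_t y = 0; y < configs.size(); ++y)
    {
        // Distance to the reference workload stored in columns 0..3 of the row
        const double dm  = m - configs[y][0];
        const double dn  = n - configs[y][1];
        const double dk  = k - configs[y][2];
        const double db  = b - configs[y][3];
        const double acc = std::sqrt(dm * dm + dn * dn + dk * dk + db * db);
        if (acc < min_acc)
        {
            min_acc = acc;
            min_idx = y;
        }
    }
    return min_idx;
}

int main()
{
    const std::vector<std::vector<int>> configs = {{64, 64, 64, 1}, {1024, 1024, 256, 1}};
    std::cout << "nearest row: " << nearest_config(configs, 900, 800, 200, 1) << "\n"; // nearest row: 1
}
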
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h index c055753c48..fa76c5dba7 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeMidgard.h @@ -45,10 +45,12 @@ public: ClGemmDefaultConfigNativeMidgard(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + default_q8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp index 95a4d2bd69..97a1298b0a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,37 +39,38 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigNativeValhall::ClGemmDefaultConfigNativeValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigNativeValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default(&ClGemmDefaultConfigNativeValhall::configure_G77_f32, - &ClGemmDefaultConfigNativeValhall::configure_G77_f16, - &ClGemmDefaultConfigNativeValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_default( + &ClGemmDefaultConfigNativeValhall::configure_G77_f32, &ClGemmDefaultConfigNativeValhall::configure_G77_f16, + &ClGemmDefaultConfigNativeValhall::configure_G77_u8); auto func = configs_default.get_function(data_type); ARM_COMPUTE_ERROR_ON_MSG(func == nullptr, "Data type not support for GEMM"); return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
+ClGemmDefaultConfigNativeValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -83,18 +85,19 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 8192) + else if (n >= 2048 && n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, 1, false, false, false, false); } @@ -109,20 +112,21 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigNativeValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { - if(n < 2048) + if (n < 2048) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 1, false, false, false, false); } - else if(n >= 2048 && n < 16384) + else if (n >= 2048 && n < 16384) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -133,7 +137,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } else { - if(m < 64) + if (m < 64) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 1, false, false, false, false); } @@ -145,9 +149,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } else { - if(m == 1) + if (m == 1) { - if(n < 8192) + if (n < 8192) { return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 1, false, false, false, false); } @@ -165,4 +169,4 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigNativeValhall } // namespace gemm } // namespace kernels } // namespace opencl -} // namespace arm_compute
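
configure_G77_u8() above is typical of how these per-target heuristics branch: first on whether the device supports the 8-bit dot-product extension, then on m == 1 (the matrix-vector case) with thresholds over n. The sketch below transcribes the branch values visible in the hunks; the returns marked "assumed" correspond to context lines the diff elides:

#include <iostream>
#include <tuple>

// Returns (m0, n0, k0) block sizes, following the configure_G77_u8() structure.
static std::tuple<unsigned int, unsigned int, unsigned int> g77_u8_blocks(bool dot8, unsigned int m, unsigned int n)
{
    if (dot8)
    {
        if (m == 1) // GEMV-like workload: block only along n
        {
            if (n < 2048)
            {
                return {1, 2, 16};
            }
            if (n < 16384)
            {
                return {1, 4, 16};
            }
            return {1, 4, 16}; // wide-n value: assumed
        }
        if (m < 64)
        {
            return {2, 2, 16};
        }
        return {4, 4, 16}; // assumed
    }
    if (m == 1 && n < 8192)
    {
        return {1, 4, 16};
    }
    return {4, 4, 16}; // fallback without dot8: assumed
}

int main()
{
    const auto [m0, n0, k0] = g77_u8_blocks(true, 1, 4096);
    std::cout << m0 << ", " << n0 << ", " << k0 << "\n"; // 1, 4, 16
}
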
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h index f0f812fd46..c91b095279 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmDefaultConfigNativeValhall.h @@ -45,12 +45,16 @@ public: ClGemmDefaultConfigNativeValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h index cf8412830b..955bb3c01a 100644 --- a/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/native/ClGemmNativeKernelConfig.h @@ -51,7 +51,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: return std::make_unique<ClGemmDefaultConfigNativeMidgard>(gpu); diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp index 657018eb53..c956c347ef 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.cpp @@ -29,6 +29,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -43,30 +44,31 @@ namespace gemm { using namespace arm_compute::misc::shape_calculator; -ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedBifrost::ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, 
GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52( + &ClGemmDefaultConfigReshapedBifrost::configure_G52_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigReshapedBifrost::configure_G76_f32, &ClGemmDefaultConfigReshapedBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedBifrost::configure_G76_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -83,12 +85,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -98,12 +101,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 8, 2, true, true, true, false); } @@ -113,14 +117,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned 
int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 2, 2, true, false, false, true); } @@ -131,7 +136,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } else { - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 2, 2, true, false, false, true); } @@ -142,7 +147,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; @@ -154,100 +160,108 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(workload <= 274.4000f) + if (workload <= 274.4000f) { - if(r_nk <= 0.7461f) + if (r_nk <= 0.7461f) { - if(r_mn <= 21.1667f) + if (r_mn <= 21.1667f) { return configure_lhs_rhs_info(m, n, 4, 2, 4, 4, 4, false, true, true, false, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mk <= 17.3926f) + if (r_mk <= 17.3926f) { - if(workload <= 542.4000f) + if (workload <= 542.4000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_nk <= 0.5463f) + if (r_nk <= 0.5463f) { - if(workload <= 11767.6001f) + if (workload <= 11767.6001f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, true, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * 
static_cast<float>(b)) / 20.0f; - if(workload <= 323.4000f) + if (workload <= 323.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 4, 8, false, false, false, true, false); } @@ -257,7 +271,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -268,7 +283,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro GEMMRHSMatrixInfo rhs_info_img; // Get lhs_info/rhs_info in case of OpenCL buffer - if(n <= 4) + if (n <= 4) { std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, true, false, false, true); } @@ -279,15 +294,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro // Get lhs_info/rhs_info in case of OpenCL image // Condition on the GPU workload - if((m / 4) * (n / 4) >= 2560) + if ((m / 4) * (n / 4) >= 2560) { // Big workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 8, true, true, true, false, true); } else { // Small workload - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 1, true, true, true, false, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -297,7 +314,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro // In case of vector by matrix with few work-items, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = (n <= 4) ? 
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -307,16 +324,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(workload <= 1595.2000f) + if (workload <= 1595.2000f) { - if(r_mk <= 2.1044f) + if (r_mk <= 2.1044f) { - if(workload <= 870.4000f) + if (workload <= 870.4000f) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 2, true, false, true, false, false); } @@ -336,12 +354,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifro } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, false, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h index d86d1ba0a7..9227ec2551 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedBifrost.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, 
unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp index 58d0873b86..70b324eb5a 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.cpp @@ -26,6 +26,7 @@ #include "arm_compute/core/CL/CLHelpers.h" #include "arm_compute/core/CL/CLKernelLibrary.h" #include "arm_compute/core/GPUTarget.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" #include <utility> @@ -38,26 +39,27 @@ namespace kernels { namespace gemm { -ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) - : IClGemmKernelConfig(gpu) +ClGemmDefaultConfigReshapedValhall::ClGemmDefaultConfigReshapedValhall(GPUTarget gpu) : IClGemmKernelConfig(gpu) { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClGemmDefaultConfigReshapedValhall::configure_G77_f32, &ClGemmDefaultConfigReshapedValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + 
&ClGemmDefaultConfigReshapedValhall::configure_G78_f32, &ClGemmDefaultConfigReshapedValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -72,12 +74,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 8, 16, 16, 1, 0, 0, 1); } @@ -87,7 +90,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -104,17 +108,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 0); - if(r_mk <= 0.11824845522642136) + if (r_mk <= 0.11824845522642136) { - if(workload <= 880.0) + if (workload <= 880.0) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } else { - if(r_nk <= 0.42521367967128754) + if (r_nk <= 0.42521367967128754) { - if(workload <= 1726.4000244140625) + if (workload <= 1726.4000244140625) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 0); } @@ -123,13 +127,12 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(workload <= 1241.6000366210938) + if (workload <= 1241.6000366210938) { return configure_lhs_rhs_info(m, n, 2, 4, 4, 1, 4, 0, 0, 1, 0, 0); } @@ -142,17 +145,16 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11404.7998046875) + if (workload <= 11404.7998046875) { - if(r_mk <= 1.0126488208770752) + if (r_mk <= 1.0126488208770752) { - if(r_mn <= 2.545312523841858) + if (r_mn <= 2.545312523841858) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -161,43 +163,39 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 2881.199951171875) + if (workload <= 2881.199951171875) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 2, 0, 0, 1, 0, 1); return 
select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - if(r_nk <= 0.5765306055545807) + if (r_nk <= 0.5765306055545807) { - if(r_mn <= 6.010416746139526) + if (r_mn <= 6.010416746139526) { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 0, 1, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else @@ -205,27 +203,27 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 1, 1, 0, 1, 0, 1); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 1288.0000f) + if (workload <= 1288.0000f) { - if(workload <= 505.6000f) + if (workload <= 505.6000f) { - if(r_mn <= 0.4466f) + if (r_mn <= 0.4466f) { - if(r_nk <= 0.2384f) + if (r_nk <= 0.2384f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -241,9 +239,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mn <= 0.2250f) + if (r_mn <= 0.2250f) { - if(r_mn <= 0.1599f) + if (r_mn <= 0.1599f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -254,11 +252,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.7609f) + if (r_mk <= 0.7609f) { - if(r_mn <= 2.5453f) + if (r_mn <= 2.5453f) { - if(workload <= 1089.6000f) + if (workload <= 1089.6000f) { return configure_lhs_rhs_info(m, n, 2, 4, 8, 4, 4, 0, 0, 1, 0, 1); } @@ -281,29 +279,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5434.4001f) + if (workload <= 5434.4001f) { - if(workload <= 1603.2000f) + if (workload <= 1603.2000f) { return configure_lhs_rhs_info(m, n, 4, 
4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.6192f) + if (r_nk <= 0.6192f) { - if(r_mn <= 16.1016f) + if (r_mn <= 16.1016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 2750.0000f) + if (workload <= 2750.0000f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_mk <= 6.3151f) + if (r_mk <= 6.3151f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -316,15 +314,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 0.0387f) + if (r_mk <= 0.0387f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } else { - if(r_mk <= 2.5859f) + if (r_mk <= 2.5859f) { - if(r_mk <= 0.2734f) + if (r_mk <= 0.2734f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -343,13 +341,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 25.7500f) + if (r_mk <= 25.7500f) { - if(r_mk <= 0.3615f) + if (r_mk <= 0.3615f) { - if(r_mn <= 0.0913f) + if (r_mn <= 0.0913f) { - if(r_mk <= 0.0683f) + if (r_mk <= 0.0683f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -365,15 +363,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 11174.3999f) + if (workload <= 11174.3999f) { - if(r_mk <= 0.8047f) + if (r_mk <= 0.8047f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(workload <= 7185.5999f) + if (workload <= 7185.5999f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 1, 0, 1); } @@ -385,9 +383,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 17917.5000f) + if (workload <= 17917.5000f) { - if(r_mk <= 1.5078f) + if (r_mk <= 1.5078f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -398,7 +396,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 34449.6016f) + if (workload <= 34449.6016f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -412,11 +410,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_mk <= 331.1111f) + if (r_mk <= 331.1111f) { - if(workload <= 53397.5996f) + if (workload <= 53397.5996f) { - if(r_mn <= 57.8063f) + if (r_mn <= 57.8063f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -427,7 +425,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(r_nk <= 0.9211f) + if (r_nk <= 0.9211f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 4, 2, 0, 0, 1, 0, 1); } @@ -439,7 +437,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 38070.4004f) + if (workload <= 38070.4004f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 4, 4, 0, 0, 0, 1, 1); } @@ -453,27 +451,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_nk = static_cast<float>(n) / 
static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 801.6000f) + if (workload <= 801.6000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_mn <= 0.1211f) + if (r_mn <= 0.1211f) { - if(workload <= 3296.0000f) + if (workload <= 3296.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } else { - if(r_nk <= 1.0625f) + if (r_nk <= 1.0625f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -485,15 +484,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 5068.8000f) + if (workload <= 5068.8000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } else { - if(r_nk <= 0.2361f) + if (r_nk <= 0.2361f) { - if(workload <= 12630.0000f) + if (workload <= 12630.0000f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, 0, 0, 1, 0, 1); } @@ -504,7 +503,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } else { - if(workload <= 178790.3984f) + if (workload <= 178790.3984f) { return configure_lhs_rhs_info(m, n, 8, 4, 4, 2, 2, 0, 0, 1, 0, 1); } @@ -518,12 +517,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValha } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +ClGemmDefaultConfigReshapedValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(n <= 4) + if (n <= 4) { return configure_lhs_rhs_info(m, n, 4, 2, 16, 4, 1, 0, 0, 0, 1); } diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h index 466eda00a6..5f62efb59e 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmDefaultConfigReshapedValhall.h @@ -45,14 +45,20 @@ public: ClGemmDefaultConfigReshapedValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, 
GEMMRHSMatrixInfo> + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h index 1c32f1358b..83928b3f4f 100644 --- a/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped/ClGemmReshapedKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp index 9c23d9c998..c4825bfbeb 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.cpp @@ -29,7 +29,9 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/TensorShape.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" + #include "src/gpu/cl/kernels/gemm/ClGemmHelpers.h" + #include <utility> namespace arm_compute @@ -47,33 +49,39 @@ ClGemmDefaultConfigReshapedRhsOnlyBifrost::ClGemmDefaultConfigReshapedRhsOnlyBif { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); - - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x(&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, - &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, - 
&ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedRhsOnlyBifrost::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G51( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G52( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G31( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G76( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8); + + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G7x( + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16, + &ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G76: func = configs_G76.get_function(data_type); @@ -96,14 +104,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n <= 2548) + if (n <= 2548) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, false, true, false, true, false); } @@ -118,12 +127,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G31_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -131,7 +141,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, 0, 1, 0, 1); } @@ -142,7 +152,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); @@ -154,9 +165,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const bool is_workload_big = ((m * n * b) / 16) >= 2048; - if(m == 1) + if (m == 1) { - if(n >= 8192) + if (n >= 8192) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 8, 1, h0, false, true, false, true, false); @@ -164,7 +175,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const unsigned int h0 = std::max(n / 2, 1U); - if(n <= 204) + if (n <= 204) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true, false); } @@ -177,25 +188,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, true); } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true); } } // Get lhs_info/rhs_info in case of OpenCL image const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(16)), static_cast<int>(1)); - if(is_workload_big) + if (is_workload_big) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, h0, false, true, false, false, true); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 2, 4, 8, 1, h0, false, true, false, true, true); } const TensorInfo tensor_rhs_info(TensorShape(n, k, b), 1, DataType::F32); @@ -205,7 +220,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn // In case of vector by matrix or small workloads, we use the OpenCL buffer rather than the OpenCL image2d const bool use_cl_image2d = ((m == 1) || ((((m * n * b) / 16) < 2048) && n < 128)) ? 
false : true; - if(bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) + if (bool(validate_image2d_support_on_rhs(tensor_reshaped_info, rhs_info_img)) && use_cl_image2d) { return std::make_pair(lhs_info_img, rhs_info_img); } @@ -215,7 +230,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_nk = static_cast<float>(n) / static_cast<float>(k); @@ -225,46 +241,49 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 16, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, false, true, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(workload <= 274.4000f) + if (workload <= 274.4000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 2, false, false, false, true, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 
2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -276,14 +295,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { - if(n > 2048) + if (n > 2048) { const unsigned int h0 = std::max(n / 4, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, false, true, false, true); @@ -300,7 +320,8 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G52_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; @@ -312,57 +333,59 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - if(m == 1) + if (m == 1) { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, false); - if(r_mk <= 0.0026f) + if (r_mk <= 0.0026f) { - if(r_nk <= 0.4664f) + if (r_nk <= 0.4664f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } else { - if(r_mk <= 0.0148f) + if (r_mk <= 0.0148f) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 1, 4, 16, 1, 16, false, true, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } else { - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 8, 4, 1, 2, false, false, false, false, false); - if(workload <= 362.6000f) + if (workload <= 362.6000f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - if(r_mn <= 22.6067f) + if (r_mn <= 22.6067f) { - if(workload <= 
708.8000f) + if (workload <= 708.8000f) { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -371,27 +394,28 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 0.0917f) + if (r_nk <= 0.0917f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 16, false, false, false, true, false); } else { - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, false, false, false, true); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 32, false, true, false, true, false); } @@ -400,15 +424,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(workload <= 7449.60f) + if (workload <= 7449.60f) { - if(workload <= 691.60f) + if (workload <= 691.60f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 8, false, false, false, false, false); } else { - if(workload <= 4155.20f) + if (workload <= 4155.20f) { return configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); } @@ -420,21 +444,22 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16300.80f) + if (workload <= 16300.80f) { - if(r_mn <= 44.56f) + if (r_mn <= 44.56f) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 8, 4, 4, 1, 1, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } else { @@ -448,23 +473,25 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn GEMMLHSMatrixInfo lhs_info_img; 
GEMMRHSMatrixInfo rhs_info_img; - std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); - std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); + std::tie(lhs_info_img, rhs_info_img) = + configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 2, false, true, false, false, true); + std::tie(lhs_info_buf, rhs_info_buf) = + configure_lhs_rhs_info(m, n, 5, 2, 8, 1, 16, false, false, false, false, false); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F16); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F16); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int n0 = n < 1280 ? 2 : 4; const unsigned int h0 = std::max(n / n0, 1U); @@ -476,14 +503,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G7x_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(dot8_supported(CLKernelLibrary::get().get_device())) + if (dot8_supported(CLKernelLibrary::get().get_device())) { - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -497,7 +525,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 2), static_cast<int>(128)), static_cast<int>(1)); - if(m == 1) + if (m == 1) { return configure_lhs_rhs_info(m, n, 1, 2, 4, 1, h0, false, true, false, true); } @@ -508,12 +536,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G76_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, h0, false, true, false, true); @@ -524,12 +553,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyBifrost::configure_G51_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const 
unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, false, true, false, true); diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h index 321cbb5250..77c0c8d500 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyBifrost.h @@ -45,21 +45,34 @@ public: ClGemmDefaultConfigReshapedRhsOnlyBifrost(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G52_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + 
configure_G51_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G7x_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G76_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G51_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G31_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp index d08bf84c72..da3e2ec912 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.cpp @@ -50,30 +50,35 @@ ClGemmDefaultConfigReshapedRhsOnlyValhall::ClGemmDefaultConfigReshapedRhsOnlyVal { } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure( + unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) { - using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> (ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, - unsigned int b); + using ConfigurationFunctionExecutorPtr = std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ( + ClGemmDefaultConfigReshapedRhsOnlyValhall::*)(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G77( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G78( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G710( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); - 
CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715(&ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, - &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); + CLGEMMConfigArray<ConfigurationFunctionExecutorPtr> configs_G715( + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16, + &ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8); ConfigurationFunctionExecutorPtr func = nullptr; - switch(_target) + switch (_target) { case GPUTarget::G78: func = configs_G78.get_function(data_type); @@ -96,29 +101,29 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn return (this->*func)(m, n, k, b); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - if(m == 1) + if (m == 1) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(r_mk <= 0.0064484127797186375) + if (r_mk <= 0.0064484127797186375) { - if(r_mn <= 0.0028273810748942196) + if (r_mn <= 0.0028273810748942196) { GEMMLHSMatrixInfo lhs_info_buf; GEMMRHSMatrixInfo rhs_info_buf; GEMMLHSMatrixInfo lhs_info_img; GEMMRHSMatrixInfo rhs_info_img; - const unsigned int h0 = std::max(n / 4, 1U); + const unsigned int h0 = std::max(n / 4, 1U); std::tie(lhs_info_img, rhs_info_img) = configure_lhs_rhs_info(m, n, 1, 4, 8, 1, 16, 0, 1, 0, 0, 1); std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 1, 4, 4, 1, h0, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } else { @@ -127,7 +132,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.020312500186264515) + if (r_mk <= 0.020312500186264515) { return configure_lhs_rhs_info(m, n, 1, 2, 16, 1, 4, 0, 1, 0, 0, 0); } @@ -143,9 +148,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mk = static_cast<float>(m) / static_cast<float>(k); - if(workload <= 1999.2000122070312) + if (workload <= 1999.2000122070312) { - if(workload <= 747.1999816894531) + if (workload <= 747.1999816894531) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -159,15 +164,14 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else { - if(r_mn <= 0.03348214365541935) + if (r_mn <= 0.03348214365541935) { - if(r_mk <= 0.028125000186264515) + if (r_mk <= 0.028125000186264515) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); } @@ -181,8 +185,7 @@ 
std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 8, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } else @@ -195,168 +198,112 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info_buf, rhs_info_buf) = configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 1, 0, 1, 0); return select_lhs_rhs_info(std::make_pair(lhs_info_img, rhs_info_img), - std::make_pair(lhs_info_buf, rhs_info_buf), - n, k, b, DataType::F32); + std::make_pair(lhs_info_buf, rhs_info_buf), n, k, b, DataType::F32); } } } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0 }, - { 1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_small_fallback = - { - { 102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 }, - { 16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 50176, 64, 
300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - }; - - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0 }, - { 49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0 }, + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 8, 8, 1, 0, 1, 1, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 4, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 4, 4, 1, 0, 1, 0, 1, 1, 0}, {1, 1352, 1520, 1, 1, 2, 8, 1, 0, 1, 1, 1, 1, 0}, + {1, 4096, 25088, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 1, 1}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 1}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_fallback = {{102400, 4, 96, 1, 2, 2, 16, 1, 4, 1, 1, 1, 1, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}, + {16384, 4, 128, 1, 2, 2, 16, 1, 2, 1, 1, 1, 1, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {12604, 60, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {369664, 32, 28, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, + {16544, 104, 160, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, {12604, 60, 160, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 4, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 128, 1, 1, 1, 0, 0}, + {2688, 136, 1492, 1, 8, 4, 4, 1, 128, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + 
{49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 1}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, }; - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = { + {24, 488, 88, 1, 2, 4, 16, 1, 4, 1, 1, 1, 0, 0}, + {49, 1024, 512, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {49, 1024, 1024, 1, 4, 4, 8, 1, 256, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0 }, - { 24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 64, 1, 1, 1, 0, 1}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {72, 92, 136, 1, 2, 2, 8, 1, 128, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 256, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, {1000, 152, 304, 1, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 4, 4, 1, 256, 1, 1, 1, 0, 0}, + {24, 88, 236, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 64, 1, 1, 1, 1, 0}}; + + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 
1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 4, 4, 8, 1, 128, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 64, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast<float>(m) / static_cast<float>(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_fallback; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -381,17 +328,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G77_u8( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { ARM_COMPUTE_UNUSED(k); ARM_COMPUTE_UNUSED(b); - if(m == 1) + if (m == 1) { const unsigned int h0 = std::max(n / 2, 1U); return configure_lhs_rhs_info(m, n, 1, 4, 16, 1, h0, 0, 1, 0, 1); @@ -399,7 +346,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn else { const int h0 = std::max(std::min(static_cast<int>(n / 4), static_cast<int>(256)), static_cast<int>(1)); - if(m >= 28) + if (m >= 28) { return configure_lhs_rhs_info(m, n, 4, 4, 16, 1, h0, 0, 1, 0, 1); } @@ -410,30 +357,31 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / static_cast<float>(k); const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; - if(m == 1) + if (m == 1) { - if(workload <= 278.7000f) + if (workload <= 278.7000f) { - if(workload <= 7.5000f) + if (workload <= 7.5000f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mn <= 0.0031f) + if (r_mn <= 0.0031f) { - if(workload <= 256.6000f) + if (workload <= 256.6000f) { - if(workload <= 16.7500f) + if (workload <= 16.7500f) { - if(r_nk <= 1.6671f) + if (r_nk <= 1.6671f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -454,15 +402,15 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 0.0027f) + if (r_mk <= 0.0027f) { - if(r_mk <= 0.0014f) + if (r_mk <= 0.0014f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } else { - if(workload <= 8.9500f) + if (workload <= 8.9500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } @@ -474,13 +422,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 14.1500f) + if (workload <= 14.1500f) { return configure_lhs_rhs_info(m, n, 1, 2, 8, 1, 2, 0, 1, 1, 0, 0); } else { - if(r_mk <= 0.0041f) + if (r_mk <= 0.0041f) { return configure_lhs_rhs_info(m, n, 1, 2, 2, 1, 32, 0, 0, 0, 1, 0); } @@ -495,9 +443,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 363.7000f) + if (workload <= 363.7000f) { - if(r_mk <= 0.0031f) + if (r_mk <= 0.0031f) { return configure_lhs_rhs_info(m, n, 1, 4, 2, 1, 32, 0, 1, 0, 1, 0); } @@ -514,9 +462,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(workload <= 704.0000f) + if (workload <= 704.0000f) { return configure_lhs_rhs_info(m, n, 2, 2, 4, 1, 32, 0, 1, 0, 1, 0); } @@ -527,9 +475,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 16761.6006f) + if (workload <= 16761.6006f) { - if(r_mn <= 187.1250f) + if (r_mn <= 187.1250f) { return configure_lhs_rhs_info(m, n, 4, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -540,7 +488,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 432.4630f) + if (r_mk <= 432.4630f) { return configure_lhs_rhs_info(m, n, 5, 4, 4, 1, 16, 0, 0, 0, 1, 1); } @@ -553,42 +501,37 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G78_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { const float workload = (static_cast<float>(m) * static_cast<float>(n) * static_cast<float>(b)) / 20.0f; const float r_mn = static_cast<float>(m) / static_cast<float>(n); const float r_mk = static_cast<float>(m) / static_cast<float>(k); const float r_nk = static_cast<float>(n) / 
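// (Editor's note: illustrative commentary, not part of the patch.) Each of these
// per-target configure functions is effectively a trained decision tree. The
// features are workload = m * n * b / 20 plus the aspect ratios r_mn = m/n,
// r_mk = m/k and r_nk = n/k computed just above, and every leaf returns one
// configure_lhs_rhs_info() call whose arguments encode the block sizes
// (m0, n0, k0) and the interleave/transpose/cl_image flags tuned for that
// region of the problem space.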
static_cast<float>(k); - if(m == 1) + if (m == 1) { - const GeMMConfigsMatrix configs_mnkb_best = - { - { 1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_best = { + {1, 8984, 640, 1, 1, 4, 2, 1, 0, 1, 0, 1, 1, 0}, {1, 420, 392, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 2, 1, 0, 1, 0, 1, 1, 0}, + {1, 5304, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 732, 8988, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}}; return find_lhs_rhs_info(configs_mnkb_best, m, n, k, b); } else { - if(workload <= 1384.8000f) + if (workload <= 1384.8000f) { - if(r_nk <= 0.8333f) + if (r_nk <= 0.8333f) { - if(r_mk <= 0.9119f) + if (r_mk <= 0.9119f) { return configure_lhs_rhs_info(m, n, 2, 2, 16, 1, 4, 0, 1, 0, 1, 1); } else { - if(r_nk <= 0.1181f) + if (r_nk <= 0.1181f) { return configure_lhs_rhs_info(m, n, 2, 2, 8, 1, 32, 0, 0, 1, 0, 0); } @@ -600,7 +543,7 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_mk <= 1.0013f) + if (r_mk <= 1.0013f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 32, 0, 1, 1, 0, 1); } @@ -612,11 +555,11 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(workload <= 11404.7998f) + if (workload <= 11404.7998f) { - if(r_mk <= 2.2884f) + if (r_mk <= 2.2884f) { - if(r_nk <= 0.9286f) + if (r_nk <= 0.9286f) { return configure_lhs_rhs_info(m, n, 4, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -632,9 +575,9 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } else { - if(r_nk <= 1.1926f) + if (r_nk <= 1.1926f) { - if(r_mn <= 1385.7917f) + if (r_mn <= 1385.7917f) { return configure_lhs_rhs_info(m, n, 6, 4, 8, 1, 4, 0, 1, 1, 0, 1); } @@ -652,12 +595,13 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f32( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F32, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } @@ -667,153 +611,101 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn } } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G710_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { - const GeMMConfigsMatrix configs_1nkb_best = - { - { 1, 8984, 640, 1, 1, 2, 2, 1, 0, 
1, 0, 1, 0, 0 }, - { 1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0 }, - { 1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0 }, - { 1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_1nkb_best = { + {1, 8984, 640, 1, 1, 2, 2, 1, 0, 1, 0, 1, 0, 0}, {1, 420, 392, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, + {1, 644, 5288, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}, {1, 6512, 6404, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 5304, 640, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, {1, 1352, 1520, 1, 1, 2, 4, 1, 0, 1, 0, 1, 0, 0}, + {1, 4096, 25088, 1, 1, 2, 8, 1, 0, 1, 0, 1, 1, 0}, {1, 732, 8988, 1, 1, 2, 8, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_n_small_best = {{102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}, + {16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0}}; + + const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {16544, 104, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_n_small_best = - { - { 102400, 4, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 102400, 2, 96, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 4, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 }, - { 16384, 2, 128, 1, 1, 2, 16, 1, 0, 1, 0, 1, 0, 0 } + const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = { + {25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}, + {369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0}, + {8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, {2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, + {50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0}, {16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, + {12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0}, {3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0}, + {29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0}, }; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_best = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 16, 1, 8, 1, 1, 1, 0, 1 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 8944, 32, 776, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 16544, 104, 160, 1, 4, 4, 8, 
1, 4, 1, 1, 1, 0, 1 }, - { 12604, 60, 160, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 3728, 96, 196, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_m_gt_n_fallback = - { - { 25584, 88, 16, 1, 4, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 25584, 16, 68, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 369664, 32, 28, 1, 2, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 65792, 44, 24, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 23036, 56, 736, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 90968, 40, 600, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 0 }, - { 8944, 32, 776, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - { 2688, 136, 1492, 1, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 50176, 64, 300, 1, 4, 8, 4, 1, 128, 1, 1, 1, 0, 0 }, - { 16544, 104, 160, 1, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12604, 60, 160, 1, 2, 8, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 3728, 96, 196, 1, 2, 8, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 29584, 32, 28, 1, 2, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 12544, 32, 27, 1, 2, 8, 8, 1, 16, 1, 1, 1, 0, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = {{24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0}, + {49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0}}; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_best = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } + const GeMMConfigsMatrix configs_mnkb_squared_best = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1}, }; - const GeMMConfigsMatrix configs_mnkb_n_gt_m_fallback = - { - { 24, 488, 88, 1, 2, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 512, 1, 2, 4, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 49, 1024, 1024, 1, 2, 4, 8, 1, 4, 1, 1, 1, 1, 0 } - }; - - const GeMMConfigsMatrix configs_mnkb_squared_best = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 180, 420, 952, 1, 4, 4, 8, 1, 16, 1, 1, 1, 0, 1 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 4, 1, 1, 1, 1, 1 }, + const GeMMConfigsMatrix configs_mnkb_squared_fallback = { + {24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, {24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0}, + {72, 92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0}, {268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, + {180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, {1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0}, {196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0}, }; - const GeMMConfigsMatrix configs_mnkb_squared_fallback = - { - { 24, 88, 236, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 24, 88, 488, 1, 2, 2, 8, 1, 4, 1, 1, 1, 1, 0 }, - { 72, 
92, 136, 1, 2, 2, 8, 1, 32, 1, 1, 1, 1, 0 }, - { 268, 824, 5076, 1, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 180, 420, 952, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - { 1000, 152, 304, 1, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 272, 400, 2116, 1, 2, 8, 4, 1, 4, 1, 1, 1, 0, 0 }, - { 196, 512, 512, 1, 5, 2, 8, 1, 8, 1, 1, 1, 1, 0 }, - }; + const GeMMConfigsMatrix configs_mnkb_best_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1}, {4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1}, {24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, {5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1}, + {1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1}}; - const GeMMConfigsMatrix configs_mnkb_best_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 1 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 1 }, - { 24, 464, 412, 24, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 5776, 64, 32, 36, 4, 4, 8, 1, 4, 1, 1, 1, 0, 1 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 1 } - }; - - const GeMMConfigsMatrix configs_mnkb_fallback_batched = - { - { 3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 }, - { 4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0 }, - { 688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0 }, - { 112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0 }, - { 5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0 }, - { 1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0 }, - { 2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0 } - }; + const GeMMConfigsMatrix configs_mnkb_fallback_batched = { + {3136, 64, 64, 36, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}, {4096, 48, 32, 36, 4, 4, 8, 1, 64, 1, 1, 1, 0, 0}, + {688, 92, 68, 32, 4, 8, 4, 1, 32, 1, 1, 1, 0, 0}, {24, 464, 412, 24, 2, 8, 4, 1, 32, 1, 1, 1, 0, 0}, + {112, 184, 144, 28, 4, 4, 8, 1, 8, 1, 1, 1, 0, 0}, {5776, 64, 32, 36, 2, 8, 8, 1, 32, 1, 1, 1, 0, 0}, + {1568, 64, 40, 36, 4, 8, 4, 1, 16, 1, 1, 1, 0, 0}, {2920, 64, 64, 24, 4, 8, 4, 1, 8, 1, 1, 1, 0, 0}}; const GeMMConfigsMatrix *configs_best_to_use = nullptr; const GeMMConfigsMatrix *configs_fallback_to_use = nullptr; - if(b == 1) + if (b == 1) { constexpr float ratio_m_gt_n = 10.f; constexpr float ratio_n_gt_m = 0.1f; constexpr unsigned int n_small_thr = 4; const float ratio = static_cast<float>(m) / static_cast<float>(n); - if(m == 1) + if (m == 1) { // We do not need fallback in this case, as we never use cl_image for the rhs tensor configs_best_to_use = &configs_1nkb_best; configs_fallback_to_use = &configs_1nkb_best; } - else if(n <= n_small_thr && ratio > ratio_m_gt_n) + else if (n <= n_small_thr && ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_n_small_best; configs_fallback_to_use = &configs_mnkb_n_small_best; } - else if(ratio > ratio_m_gt_n) + else if (ratio > ratio_m_gt_n) { configs_best_to_use = &configs_mnkb_m_gt_n_best; configs_fallback_to_use = &configs_mnkb_m_gt_n_fallback; } - else if(ratio < ratio_n_gt_m) + else if (ratio < ratio_n_gt_m) { configs_best_to_use = &configs_mnkb_n_gt_m_best; configs_fallback_to_use = &configs_mnkb_n_gt_m_fallback; @@ -838,17 +730,17 @@ std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOn std::tie(lhs_info0, rhs_info0) = find_lhs_rhs_info(*configs_best_to_use, m, n, k, b); 
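// (Editor's note: illustrative sketch, not part of the patch.) These per-GPU
// heuristics all share one pattern: classify the problem shape (m == 1, small n
// with m >> n, m >> n, n >> m, roughly square, or batched), look up a "best"
// tuning that may export the RHS tensor to a cl_image and a "fallback" tuning
// that only needs a buffer, then keep the best variant only when cl_image is
// actually usable. A minimal sketch of that last step, where
// rhs_cl_image_usable() is a hypothetical predicate standing in for the checks
// performed by select_lhs_rhs_info():
//
//     std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo>
//     select(const std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> &best,
//            const std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> &fallback,
//            unsigned int n, unsigned int k, unsigned int b, DataType dt)
//     {
//         // Fall back to the buffer-only tuning when the RHS cannot be
//         // exported to a cl_image for this device, shape and data type.
//         return rhs_cl_image_usable(n, k, b, dt) ? best : fallback;
//     }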
std::tie(lhs_info1, rhs_info1) = find_lhs_rhs_info(*configs_fallback_to_use, m, n, k, b); - return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), - std::make_pair(lhs_info1, rhs_info1), - n, k, b, DataType::F16); + return select_lhs_rhs_info(std::make_pair(lhs_info0, rhs_info0), std::make_pair(lhs_info1, rhs_info1), n, k, b, + DataType::F16); } -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> ClGemmDefaultConfigReshapedRhsOnlyValhall::configure_G715_f16( + unsigned int m, unsigned int n, unsigned int k, unsigned int b) { unsigned int best_m0; unsigned int best_n0; - if(is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) + if (is_mmul_kernel_preferred(m, n, k, b, DataType::F16, best_m0, best_n0)) { return configure_lhs_rhs_info(m, n, best_m0, best_n0, 1, 1, 4, false, true, false, false, true); } diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h index f2952a3d30..a0ea337eb1 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmDefaultConfigReshapedRhsOnlyValhall.h @@ -45,17 +45,26 @@ public: ClGemmDefaultConfigReshapedRhsOnlyValhall(GPUTarget gpu); // Inherited overridden method - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure(unsigned int m, unsigned int n, unsigned int k, unsigned int b, DataType data_type) override; private: - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G77_u8(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); - std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G78_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G77_u8(unsigned int m, unsigned 
int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G710_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G715_f32(unsigned int m, unsigned int n, unsigned int k, unsigned int b); + std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> + configure_G715_f16(unsigned int m, unsigned int n, unsigned int k, unsigned int b); }; } // namespace gemm } // namespace kernels diff --git a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h index 1503e74eb6..e07ad993ed 100644 --- a/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h +++ b/src/gpu/cl/kernels/gemm/reshaped_only_rhs/ClGemmReshapedOnlyRhsKernelConfig.h @@ -50,7 +50,7 @@ public: */ static std::unique_ptr<IClGemmKernelConfig> create(GPUTarget gpu) { - switch(get_arch_from_target(gpu)) + switch (get_arch_from_target(gpu)) { case GPUTarget::MIDGARD: case GPUTarget::BIFROST: diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp index 2407c6ca5e..689a743fdf 100644 --- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.cpp @@ -36,7 +36,9 @@ namespace opencl { namespace kernels { -Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info) +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, + const MatMulKernelInfo &matmul_kernel_info) { const size_t lhs_k = matmul_kernel_info.adj_lhs ? lhs_shape.y() : lhs_shape.x(); const size_t rhs_k = matmul_kernel_info.adj_rhs ? 
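// (Editor's note: illustrative commentary, not part of the patch.) adj_lhs and
// adj_rhs flag a transposed (adjoint) operand, so the reduction extent K is read
// from the opposite axis: a non-adjoint LHS laid out as (x = K, y = M) gives
// lhs_k = lhs_shape.x(), and the RHS K extent is picked symmetrically here.
// Dimensions from index 2 upwards are batch dimensions, which the loop below
// requires to match exactly because batch broadcasting is unsupported.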
rhs_shape.x() : rhs_shape.y(); @@ -46,7 +48,7 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh ARM_COMPUTE_RETURN_ERROR_ON_MSG(rhs_shape.total_size() == 0, "Rhs tensor can't be empty"); constexpr size_t batch_dim_start = 2; - for(size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) + for (size_t i = batch_dim_start; i < Coordinates::num_max_dimensions; ++i) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(lhs_shape[i] != rhs_shape[i], "Batch dimension broadcasting is not supported"); } @@ -54,9 +56,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh return Status{}; } -std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, - const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, - int mmul_m0, int mmul_n0) +std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0) { ARM_COMPUTE_UNUSED(lhs, rhs); diff --git a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h index 210f22b109..c2ae2a67f4 100644 --- a/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h +++ b/src/gpu/cl/kernels/helpers/MatMulKernelHelpers.h @@ -44,7 +44,8 @@ namespace kernels * * @return true if the shapes and matmul kernel info match */ -Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorShape &rhs_shape, +Status validate_matmul_input_shapes(const TensorShape &lhs_shape, + const TensorShape &rhs_shape, const MatMulKernelInfo &matmul_kernel_info); /** Validate and configure window for Matmul MMUL kernels * @@ -58,9 +59,12 @@ Status validate_matmul_input_shapes(const TensorShape &lhs_shape, const TensorSh * * @return a pair of Status and Window objects */ -std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, - const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulKernelInfo &matmul_kernel_info, - int mmul_m0, int mmul_n0); +std::pair<Status, Window> validate_and_configure_window_for_mmul_kernels(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulKernelInfo &matmul_kernel_info, + int mmul_m0, + int mmul_n0); } // namespace kernels } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClActivation.cpp b/src/gpu/cl/operators/ClActivation.cpp index 74a818d738..66877ebcec 100644 --- a/src/gpu/cl/operators/ClActivation.cpp +++ b/src/gpu/cl/operators/ClActivation.cpp @@ -23,19 +23,21 @@ */ #include "src/gpu/cl/operators/ClActivation.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClActivationKernel.h" - #include "src/common/IOperator.h" #include "src/common/utils/LegacySupport.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/ClContext.h" +#include "src/gpu/cl/kernels/ClActivationKernel.h" namespace arm_compute { namespace opencl { -void ClActivation::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClActivation::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src, dst, act_info); auto k = std::make_unique<kernels::ClActivationKernel>(); @@ -53,13 +55,17 @@ namespace gpu {
namespace opencl { -std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, const AclTensorDescriptor &dst, const AclActivationDescriptor &act, bool is_validate) +std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensorDescriptor &src, + const AclTensorDescriptor &dst, + const AclActivationDescriptor &act, + bool is_validate) { TensorInfo src_info = detail::convert_to_legacy_tensor_info(src); TensorInfo dst_info = detail::convert_to_legacy_tensor_info(dst); auto info = detail::convert_to_activation_info(act); - if(is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), &dst_info.set_is_resizable(false), info))) + if (is_validate && !bool(arm_compute::opencl::ClActivation::validate(&src_info.set_is_resizable(false), + &dst_info.set_is_resizable(false), info))) { return std::make_tuple(nullptr, StatusCode::UnsupportedConfig); } @@ -68,7 +74,7 @@ std::tuple<IOperator *, StatusCode> ClContext::create_activation(const AclTensor act_op->configure(CLKernelLibrary::get().get_compile_context(), &src_info, &dst_info, info); auto op = new arm_compute::IOperator(static_cast<IContext *>(this)); - if(op == nullptr) + if (op == nullptr) { ARM_COMPUTE_LOG_ERROR_ACL("Couldn't allocate internal resources"); return std::make_tuple(nullptr, StatusCode::OutOfMemory); diff --git a/src/gpu/cl/operators/ClActivation.h b/src/gpu/cl/operators/ClActivation.h index 348dc27929..4f25bb5f24 100644 --- a/src/gpu/cl/operators/ClActivation.h +++ b/src/gpu/cl/operators/ClActivation.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ACTIVATION_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -43,7 +44,10 @@ public: * @param[out] dst Destination tensor info. Data type supported: same as @p src * @param[in] activation_info Activation layer parameters. 
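 * @note (Editor's illustrative sketch, not part of the original patch.) A
 *       typical guarded call, assuming already-initialised TensorInfo objects
 *       src and dst:
 * @code
 * ClActivation op;
 * ActivationLayerInfo info{ActivationLayerInfo::ActivationFunction::RELU};
 * if (bool(ClActivation::validate(&src, &dst, info)))
 * {
 *     op.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, info);
 * }
 * @endcode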
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ActivationLayerInfo &activation_info); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ActivationLayerInfo &activation_info); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClActivation::configure() diff --git a/src/gpu/cl/operators/ClAdd.cpp b/src/gpu/cl/operators/ClAdd.cpp index b9bf505bba..b58d0df58d 100644 --- a/src/gpu/cl/operators/ClAdd.cpp +++ b/src/gpu/cl/operators/ClAdd.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClAdd.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClAdd::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); @@ -41,8 +44,11 @@ void ClAdd::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClAdd::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClAdd::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::ADD, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClAdd.h b/src/gpu/cl/operators/ClAdd.h index a17ce7b5d6..7aed902f5d 100644 --- a/src/gpu/cl/operators/ClAdd.h +++ b/src/gpu/cl/operators/ClAdd.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ADD_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
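 * @note (Editor's note, not part of the original patch.) As the implementation
 *       in ClAdd.cpp above shows, this forwards to
 *       kernels::ClSaturatedArithmeticKernel with ArithmeticOperation::ADD, so
 *       @p policy is passed through unchanged to control overflow handling.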
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClCast.cpp b/src/gpu/cl/operators/ClCast.cpp index 05ea21b734..8f26ef003d 100644 --- a/src/gpu/cl/operators/ClCast.cpp +++ b/src/gpu/cl/operators/ClCast.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClCast.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCastKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCast::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy) +void ClCast::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + ConvertPolicy policy) { ARM_COMPUTE_LOG_PARAMS(src, dst, policy); auto k = std::make_unique<kernels::ClCastKernel>(); diff --git a/src/gpu/cl/operators/ClCast.h b/src/gpu/cl/operators/ClCast.h index 1b67ff7c8e..25d2293673 100644 --- a/src/gpu/cl/operators/ClCast.h +++ b/src/gpu/cl/operators/ClCast.h @@ -58,7 +58,8 @@ public: * @param[out] dst The destination tensor. Data types supported: U8/S8/U16/S16/U32/S32/F16/F32. * @param[in] policy Conversion policy.
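 * @note (Editor's note, not part of the original patch.) @p policy selects
 *       between ConvertPolicy::WRAP and ConvertPolicy::SATURATE for narrowing
 *       conversions; for example, casting S32 to U8 with SATURATE clamps
 *       values to the [0, 255] range.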
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); + void + configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, ConvertPolicy policy); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCast::configure() diff --git a/src/gpu/cl/operators/ClConcatenate.cpp b/src/gpu/cl/operators/ClConcatenate.cpp index a27fc37cc4..31018b9768 100644 --- a/src/gpu/cl/operators/ClConcatenate.cpp +++ b/src/gpu/cl/operators/ClConcatenate.cpp @@ -23,9 +23,14 @@ */ #include "src/gpu/cl/operators/ClConcatenate.h" +#include "arm_compute/core/Error.h" +#include "arm_compute/core/TensorInfo.h" +#include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" +#include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClBatchConcatenateKernel.h" #include "src/gpu/cl/kernels/ClDepthConcatenateKernel.h" #include "src/gpu/cl/kernels/ClHeightConcatenateKernel.h" @@ -33,42 +38,39 @@ #include "src/gpu/cl/kernels/ClWidthConcatenate4TensorsKernel.h" #include "src/gpu/cl/kernels/ClWidthConcatenateKernel.h" -#include "arm_compute/core/Error.h" -#include "arm_compute/core/TensorInfo.h" -#include "arm_compute/core/Types.h" - -#include "src/common/utils/Log.h" -#include "src/core/helpers/AutoConfiguration.h" - namespace arm_compute { namespace opencl { -void ClConcatenate::configure(const CLCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis) +void ClConcatenate::configure(const CLCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis) { ARM_COMPUTE_ERROR_ON(dst == nullptr); ARM_COMPUTE_LOG_PARAMS(src_vector, dst, axis); _axis = axis; _num_inputs = src_vector.size(); - TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); + TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, _axis); std::vector<const ITensorInfo *> const_src_vector(src_vector.size()); - std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), [](ITensorInfo * t) - { - ARM_COMPUTE_ERROR_ON_NULLPTR(t); - return t; - }); + std::transform(src_vector.begin(), src_vector.end(), const_src_vector.begin(), + [](ITensorInfo *t) + { + ARM_COMPUTE_ERROR_ON_NULLPTR(t); + return t; + }); // dst auto initialization if not yet initialized auto_init_if_empty(*dst, dst_shape, 1, src_vector[0]->data_type()); ARM_COMPUTE_ERROR_THROW_ON(ClConcatenate::validate(const_src_vector, dst, axis)); unsigned int offset = 0; - switch(_axis) + switch (_axis) { case Window::DimX: { - switch(_num_inputs) + switch (_num_inputs) { case 2: { @@ -82,14 +84,15 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std { // Configure WidthConcatenate4Tensors kernel auto kernel = std::make_unique<kernels::ClWidthConcatenate4TensorsKernel>(); - kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), src_vector.at(3), dst); + kernel->configure(compile_context, src_vector.at(0), src_vector.at(1), src_vector.at(2), + src_vector.at(3), dst); _concat_kernels.emplace_back(std::move(kernel)); break; } default: { // Configure generic case WidthConcatenate kernels - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int
i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClWidthConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -103,7 +106,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimY: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClHeightConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -114,7 +117,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case Window::DimZ: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClDepthConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -125,7 +128,7 @@ void ClConcatenate::configure(const CLCompileContext &compile_context, const std } case 3: { - for(unsigned int i = 0; i < _num_inputs; ++i) + for (unsigned int i = 0; i < _num_inputs; ++i) { auto kernel = std::make_unique<kernels::ClBatchConcatenateKernel>(); kernel->configure(compile_context, src_vector.at(i), offset, dst); @@ -148,25 +151,27 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto ARM_COMPUTE_RETURN_ERROR_ON(num_inputs < 2); unsigned int offset = 0; - switch(axis) + switch (axis) { case Window::DimX: { - switch(num_inputs) + switch (num_inputs) { case 2: // Validate WidthConcatenate2Tensors kernels if there are 2 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWidthConcatenate2TensorsKernel::validate(src_vector[0], src_vector[1], dst)); break; case 4: // Validate WidthConcatenate4Tensors kernels if there are 4 inputs ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src_vector[0], src_vector[1], src_vector[2], src_vector[3]); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate(src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenate4TensorsKernel::validate( + src_vector[0], src_vector[1], src_vector[2], src_vector[3], dst)); break; default: // Validate generic case of WidthConcatenate kernel - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWidthConcatenateKernel::validate(src, offset, dst)); @@ -178,7 +183,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case Window::DimY: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClHeightConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -187,7 +192,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case Window::DimZ: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDepthConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -196,7 +201,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto } case 3: { - for(const auto &src : src_vector) + for (const auto &src : src_vector) { 
ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClBatchConcatenateKernel::validate(src, offset, dst)); offset += src->dimension(axis); @@ -207,7 +212,7 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto ARM_COMPUTE_ERROR("Axis not supported"); } - if(dst->total_size() != 0) + if (dst->total_size() != 0) { TensorShape dst_shape = arm_compute::misc::shape_calculator::calculate_concatenate_shape(src_vector, axis); ARM_COMPUTE_RETURN_ERROR_ON(dst_shape.total_size() != dst->tensor_shape().total_size()); @@ -218,17 +223,17 @@ Status ClConcatenate::validate(const std::vector<const ITensorInfo *> &src_vecto void ClConcatenate::run(ITensorPack &tensors) { - if(tensors.empty()) + if (tensors.empty()) { ARM_COMPUTE_ERROR("No inputs provided"); } - if(static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) + if (static_cast<int>(tensors.size()) - 1 != static_cast<int>(_num_inputs)) { ARM_COMPUTE_ERROR("Configured with different number of inputs"); } - if(_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) + if (_axis == Window::DimX && (_num_inputs == 2 || _num_inputs == 4)) { ARM_COMPUTE_ERROR_ON(_concat_kernels.empty()); CLScheduler::get().enqueue_op(*_concat_kernels.at(0), tensors, true); @@ -236,7 +241,7 @@ void ClConcatenate::run(ITensorPack &tensors) else { int i = 0; - for(auto &k : _concat_kernels) + for (auto &k : _concat_kernels) { ITensorPack pack; pack.add_tensor(TensorType::ACL_SRC, tensors.get_const_tensor(ACL_SRC_VEC + i)); diff --git a/src/gpu/cl/operators/ClConcatenate.h b/src/gpu/cl/operators/ClConcatenate.h index de0cf84d2c..d8ce9d2a5c 100644 --- a/src/gpu/cl/operators/ClConcatenate.h +++ b/src/gpu/cl/operators/ClConcatenate.h @@ -57,7 +57,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src_vector. * @param[in] axis Concatenation axis. Supported underlying concatenation axis are 0, 1, 2 and 3. 
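 * @note (Editor's illustrative sketch, not part of the original patch.) A
 *       width-axis concatenation of two already-initialised tensor infos a and
 *       b into dst:
 * @code
 * ClConcatenate concat;
 * std::vector<ITensorInfo *> srcs{&a, &b};
 * concat.configure(CLKernelLibrary::get().get_compile_context(), srcs, &dst, 0); // axis 0 = Window::DimX
 * @endcode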
*/ - void configure(const ClCompileContext &compile_context, const std::vector<ITensorInfo *> &src_vector, ITensorInfo *dst, size_t axis); + void configure(const ClCompileContext &compile_context, + const std::vector<ITensorInfo *> &src_vector, + ITensorInfo *dst, + size_t axis); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConcatenate::configure() @@ -71,8 +74,8 @@ public: private: std::vector<std::unique_ptr<IClKernel>> _concat_kernels{}; - unsigned int _num_inputs{ 0 }; - unsigned int _axis{ 0 }; + unsigned int _num_inputs{0}; + unsigned int _axis{0}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConv2d.cpp b/src/gpu/cl/operators/ClConv2d.cpp index eb9475ccaa..2c3b0214fa 100644 --- a/src/gpu/cl/operators/ClConv2d.cpp +++ b/src/gpu/cl/operators/ClConv2d.cpp @@ -23,17 +23,17 @@ */ #include "src/gpu/cl/operators/ClConv2d.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/CL/functions/CLFFTConvolutionLayer.h" + +#include "src/common/utils/Log.h" #include "src/gpu/cl/operators/ClDirectConv2d.h" #include "src/gpu/cl/operators/ClGemmConv2d.h" #include "src/gpu/cl/operators/ClIndirectConv2d.h" #include "src/gpu/cl/operators/ClWinogradConv2d.h" -#include "src/common/utils/Log.h" - #include <memory> namespace @@ -48,7 +48,7 @@ namespace */ size_t get_direct_conv_kernel_threshold_nhwc(arm_compute::GPUTarget gpu_target) { - switch(gpu_target) + switch (gpu_target) { case arm_compute::GPUTarget::G76: case arm_compute::GPUTarget::G77: @@ -71,27 +71,33 @@ namespace opencl { using namespace arm_compute::misc::shape_calculator; -ClConv2d::ClConv2d() - : _operator() +ClConv2d::ClConv2d() : _operator() { } ClConv2d::~ClConv2d() = default; -void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info) +void ClConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClConv2d::validate(src, weights, ((biases != nullptr) ? biases : nullptr), dst, conv2d_info, weights_info)); + ARM_COMPUTE_ERROR_THROW_ON( + ClConv2d::validate(src, weights, ((biases != nullptr) ? 
biases : nullptr), dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, CLScheduler::get().target())) { case ConvolutionMethod::WINOGRAD: { ARM_COMPUTE_ERROR_ON(conv2d_info.num_groups != 1); auto f = std::make_unique<ClWinogradConv2d>(); - f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math); + f->configure(compile_context, src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, + conv2d_info.enable_fast_math); _operator = std::move(f); break; } @@ -125,35 +131,46 @@ void ClConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *s _aux_mem = _operator->workspace(); } -Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); const GPUTarget gpu_target = CLScheduler::get().target(); - switch(ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) + switch (ClConv2d::get_convolution_method(src, weights, dst, conv2d_info, weights_info, gpu_target)) { case ConvolutionMethod::WINOGRAD: { //Validate Winograd - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info, conv2d_info.enable_fast_math)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClWinogradConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR(ClWinogradConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, + conv2d_info.act_info, conv2d_info.enable_fast_math)); break; } case ConvolutionMethod::DIRECT: { // Validate direct convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClDirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClDirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::INDIRECT: { // Validate indirect convolution layer - ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); - ARM_COMPUTE_RETURN_ON_ERROR(ClIndirectConv2d::validate(src, weights, biases, dst, 
conv2d_info.conv_info, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG(conv2d_info.num_groups != 1, + "Grouping (num_groups != 1) with ClIndirectConv2d is not supported"); + ARM_COMPUTE_RETURN_ON_ERROR( + ClIndirectConv2d::validate(src, weights, biases, dst, conv2d_info.conv_info, conv2d_info.act_info)); break; } case ConvolutionMethod::GEMM: @@ -170,8 +187,12 @@ Status ClConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, co return Status{}; } -ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target) +ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_ERROR_ON_NULLPTR(dst); @@ -191,20 +212,35 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const using ConvolutionConfiguration = std::tuple<Size2D, Size2D, Size2D, PadStrideInfo, DataLayout>; using ConfigurationMethod = std::pair<ConvolutionConfiguration, ConvolutionMethod>; - const std::vector<ConfigurationMethod> known_configs = - { + const std::vector<ConfigurationMethod> known_configs = { // Alexnet - ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(27U, 27U), Size2D(5U, 5U), Size2D(48U, 128U), + PadStrideInfo(1U, 1U, 2U, 2U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // VGG16 / VGG19 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), ConvolutionMethod::DIRECT), + ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 64U), + PadStrideInfo(1U, 1U, 1U, 1U), DataLayout::NCHW), + ConvolutionMethod::DIRECT), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 160 - ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NCHW), + ConvolutionMethod::GEMM), // Mobilenet 224 - ConfigurationMethod(ConvolutionConfiguration(Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(224U, 224U), Size2D(3U, 3U), Size2D(3U, 32U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), // Mobilenet 160 - 
ConfigurationMethod(ConvolutionConfiguration(Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), ConvolutionMethod::GEMM), + ConfigurationMethod(ConvolutionConfiguration( + Size2D(160U, 160U), Size2D(3U, 3U), Size2D(3U, 24U), + PadStrideInfo(2U, 2U, 0U, 1U, 0U, 1U, DimensionRoundingType::FLOOR), DataLayout::NHWC), + ConvolutionMethod::GEMM), }; const auto find_config = [&](ConfigurationMethod c) @@ -213,76 +249,89 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const const PadStrideInfo info = std::get<3>(config); const DataLayout data_layout = std::get<4>(config); - return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) - && std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() - && info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && info.stride() == conv_info.stride() && (data_layout == src->data_layout()); + return std::get<0>(config) == Size2D(src->dimension(idx_w), src->dimension(idx_h)) && + std::get<1>(config) == Size2D(weights->dimension(idx_w), weights->dimension(idx_h)) && + std::get<2>(config) == Size2D(weights->dimension(idx_c), weights->dimension(3)) && + info.pad_top() == conv_info.pad_top() && info.pad_right() == conv_info.pad_right() && + info.pad_bottom() == conv_info.pad_bottom() && info.pad_left() == conv_info.pad_left() && + info.stride() == conv_info.stride() && (data_layout == src->data_layout()); }; std::vector<ConfigurationMethod>::const_iterator found; - if((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) + if ((found = std::find_if(known_configs.begin(), known_configs.end(), find_config)) != known_configs.end()) { return (*found).second; } - if(dilation != Size2D(1U, 1U)) + if (dilation != Size2D(1U, 1U)) { return ConvolutionMethod::GEMM; } else { - if(src->data_layout() == DataLayout::NCHW) + if (src->data_layout() == DataLayout::NCHW) { // SRGAN - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && + (ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info))) { return ConvolutionMethod::DIRECT; } - if((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) + if ((weights->dimension(idx_h) > 5) && (src->dimension(idx_c) > dst->dimension(idx_c)) && + (CLFFTConvolutionLayer::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math))) { return ConvolutionMethod::FFT; } - if(src->dimension(idx_c) < 16) + if (src->dimension(idx_c) < 16) { return ConvolutionMethod::GEMM; } - return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) ? ConvolutionMethod::WINOGRAD : ConvolutionMethod::GEMM; + return bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)) + ? 
ConvolutionMethod::WINOGRAD + : ConvolutionMethod::GEMM; } else { - const bool is_direct_valid = bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); - const bool is_wino_valid = bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); + const bool is_direct_valid = + bool(ClDirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_wino_valid = + bool(ClWinogradConv2d::validate(src, weights, nullptr, dst, conv_info, act_info, enable_fast_math)); const size_t kernel_sz_direct_conv_thr = get_direct_conv_kernel_threshold_nhwc(gpu_target); // SRGAN case - if((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && (conv_info.pad_top() < 3) - && is_direct_valid) + if ((src->dimension(idx_h) > 720U) && (dst->dimension(idx_h) > 720U) && (weights->dimension(idx_h) == 9) && + (conv_info.pad_top() < 3) && is_direct_valid) { return ConvolutionMethod::DIRECT; } // Floating-point case: GeMM/Direct/Winograd - if(is_data_type_float(src->data_type())) + if (is_data_type_float(src->data_type())) { // Get dst shape - TensorShape output_shape = misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); - const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); - const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; - const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; - const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; - const bool is_ofm_lt_64 = weights->dimension(3U) < 64; - const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; - const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); - const bool is_m_one = output_shape[1] * output_shape[2] == 1; - const bool is_unit_stride = (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); - const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); + TensorShape output_shape = + misc::shape_calculator::compute_deep_convolution_shape(*src, *weights, conv_info); + const bool is_large_kernel_sz = (weights->dimension(idx_w) >= kernel_sz_direct_conv_thr) && + (weights->dimension(idx_h) >= kernel_sz_direct_conv_thr); + const bool is_ifm_ge_8 = src->dimension(idx_c) >= 8; + const bool is_ifm_ge_16 = src->dimension(idx_c) >= 16; + const bool is_ofm_lte_8 = weights->dimension(3U) <= 8; + const bool is_ofm_lt_64 = weights->dimension(3U) < 64; + const bool workload_gte_8192 = (output_shape[0] * output_shape[1] * output_shape[2]) / 16 >= 8192; + const bool is_ifm_gt_ofm = src->dimension(idx_c) > weights->dimension(3U); + const bool is_m_one = output_shape[1] * output_shape[2] == 1; + const bool is_unit_stride = + (conv2d_info.conv_info.stride().first == 1) && (conv2d_info.conv_info.stride().second == 1); + const int32_t kernel_sz = weights->dimension(idx_w) * weights->dimension(idx_h); // Run Winograd if valid and IFM >= 8 - if(is_wino_valid && is_ifm_ge_8) + if (is_wino_valid && is_ifm_ge_8) { - if(is_ofm_lte_8) + if (is_ofm_lte_8) { - if(gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) + if (gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD) { return 
ConvolutionMethod::WINOGRAD; } @@ -294,18 +343,19 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const } // Direct convolution case - if(is_direct_valid) + if (is_direct_valid) { - if((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) + if ((gpu_target == arm_compute::GPUTarget::G71 || gpu_target == arm_compute::GPUTarget::G72 || + get_arch_from_target(gpu_target) == arm_compute::GPUTarget::MIDGARD)) { - if(is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) + if (is_large_kernel_sz && is_ifm_ge_16 && is_ifm_gt_ofm) { return ConvolutionMethod::DIRECT; } } - else if(gpu_target == arm_compute::GPUTarget::G76) + else if (gpu_target == arm_compute::GPUTarget::G76) { - if((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) + if ((is_large_kernel_sz && workload_gte_8192 && is_ifm_ge_16) || (is_ofm_lte_8 && is_ifm_ge_16)) { return ConvolutionMethod::DIRECT; } @@ -314,21 +364,24 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const { ConvolutionMethod preferred_conv_method = ConvolutionMethod::DIRECT; - const bool is_indirect_valid = bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); + const bool is_indirect_valid = + bool(ClIndirectConv2d::validate(src, weights, nullptr, dst, conv_info, act_info)); // indirect conv2d should be called when: // 1- When the kernel size is greater than 1x1 and less than or equal to 9x9 (81) // 2- When the kernel size is odd // 3- When the Gpu target is Arm Mali-G77 - if(is_indirect_valid) + if (is_indirect_valid) { const bool is_kernel_sz_odd = kernel_sz % 2; const bool is_g77 = gpu_target == GPUTarget::G77; - preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 ? ConvolutionMethod::INDIRECT : ConvolutionMethod::DIRECT; + preferred_conv_method = (kernel_sz > 1) && (kernel_sz <= 81) && is_kernel_sz_odd && is_g77 + ? ConvolutionMethod::INDIRECT + : ConvolutionMethod::DIRECT; } // Direct/indirect convolution used for the first layer of the network - if(workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) + if (workload_gte_8192 && !is_ifm_ge_16 && !is_unit_stride && is_ofm_lt_64) { // In general, the question we should ask for the first convolution layer of a model is: // when the execution time of im2col + gemm < direct?. Since im2col does not depend on the OFM, it means that @@ -337,13 +390,13 @@ ConvolutionMethod ClConv2d::get_convolution_method(const ITensorInfo *src, const return preferred_conv_method; } - if((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) + if ((is_large_kernel_sz || is_m_one) && workload_gte_8192 && is_ifm_ge_16) { return preferred_conv_method; } // Direct convolution used for the last layer of the network - if(is_ofm_lte_8) + if (is_ofm_lte_8) { return preferred_conv_method; } diff --git a/src/gpu/cl/operators/ClConv2d.h b/src/gpu/cl/operators/ClConv2d.h index c6c366a762..0cf3cbc1ce 100644 --- a/src/gpu/cl/operators/ClConv2d.h +++ b/src/gpu/cl/operators/ClConv2d.h @@ -26,6 +26,7 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -112,15 +113,24 @@ public: * @param[in] conv2d_info Contains convolution 2d info described in @ref Conv2dInfo. 
* @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. Data type supported: Same as @p src. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration of @ref ClConv2d * * Similar to ClConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will return the convolution called by @ref ClConv2d * @@ -137,11 +147,15 @@ public: * * @return the Convolution Method Hint */ - static ConvolutionMethod get_convolution_method(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info, const GPUTarget gpu_target); + static ConvolutionMethod get_convolution_method(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info, + const GPUTarget gpu_target); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp index 08122b6852..cf24c68d21 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.cpp @@ -23,16 +23,19 @@ */ #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClConvertFullyConnectedWeightsKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { ARM_COMPUTE_LOG_PARAMS(src, dst, original_src_shape, data_layout); auto k = std::make_unique<kernels::ClConvertFullyConnectedWeightsKernel>(); @@ -40,9 +43,12 @@ void ClConvertFullyConnectedWeights::configure(const ClCompileContext &compile_c _kernel = std::move(k); } -Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout) +Status ClConvertFullyConnectedWeights::validate(const ITensorInfo *src, + const ITensorInfo 
*dst, + const TensorShape &original_src_shape, + DataLayout data_layout) { return kernels::ClConvertFullyConnectedWeightsKernel::validate(src, dst, original_src_shape, data_layout); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h index 2794eb17b0..c46152081c 100644 --- a/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h +++ b/src/gpu/cl/operators/ClConvertFullyConnectedWeights.h @@ -43,14 +43,21 @@ public: * @param[in] original_src_shape Shape of the original src tensor (the one entering fully connected layer). * @param[in] data_layout The data layout the weights have been trained in. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClConvertFullyConnectedWeights::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const TensorShape &original_src_shape, DataLayout data_layout); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const TensorShape &original_src_shape, + DataLayout data_layout); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.cpp b/src/gpu/cl/operators/ClCopy.cpp index d3b83040d0..e2be7cebd4 100644 --- a/src/gpu/cl/operators/ClCopy.cpp +++ b/src/gpu/cl/operators/ClCopy.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClCopy.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCopyKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClCopy::validate(const ITensorInfo *src, const ITensorInfo *dst, Window * return kernels::ClCopyKernel::validate(src, dst, dst_window); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCopy.h b/src/gpu/cl/operators/ClCopy.h index 9b427f9675..fe9b58c607 100644 --- a/src/gpu/cl/operators/ClCopy.h +++ b/src/gpu/cl/operators/ClCopy.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_COPY_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] dst_window (Optional) Window to be used in case only copying into part of a tensor. Default is nullptr. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCopy::configure() diff --git a/src/gpu/cl/operators/ClCrop.cpp b/src/gpu/cl/operators/ClCrop.cpp index cef9f14c7d..6313e4fbb5 100644 --- a/src/gpu/cl/operators/ClCrop.cpp +++ b/src/gpu/cl/operators/ClCrop.cpp @@ -23,17 +23,22 @@ */ #include "src/gpu/cl/operators/ClCrop.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClCropKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, - Window *dst_window) +void ClCrop::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(src, dst, start, end, batch_index, extrapolation_value, dst_window); auto k = std::make_unique<kernels::ClCropKernel>(); @@ -41,9 +46,15 @@ void ClCrop::configure(const ClCompileContext &compile_context, const ITensorInf _kernel = std::move(k); } -Status ClCrop::validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value, Window *dst_window) +Status ClCrop::validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value, + Window *dst_window) { return kernels::ClCropKernel::validate(src, dst, start, end, batch_index, extrapolation_value, dst_window); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClCrop.h b/src/gpu/cl/operators/ClCrop.h index 1cf1c9bff4..e845cf372c 100644 --- a/src/gpu/cl/operators/ClCrop.h +++ b/src/gpu/cl/operators/ClCrop.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_CROP_H #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,16 +50,27 @@ public: * @param[in] extrapolation_value Value to be used for values outside of the image. Default is 0. * @param[in] dst_window Output window to be used in case cropped image is being copied into a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClCrop::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, Coordinates2D start, Coordinates2D end, uint32_t batch_index, float extrapolation_value = 0, - Window *dst_window = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + Coordinates2D start, + Coordinates2D end, + uint32_t batch_index, + float extrapolation_value = 0, + Window *dst_window = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDequantize.cpp b/src/gpu/cl/operators/ClDequantize.cpp index 0fccab63e0..eb6f9e7abb 100644 --- a/src/gpu/cl/operators/ClDequantize.cpp +++ b/src/gpu/cl/operators/ClDequantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClDequantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClDequantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClDirectConv2d.cpp b/src/gpu/cl/operators/ClDirectConv2d.cpp index 0215dba422..17a196ce6b 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.cpp +++ b/src/gpu/cl/operators/ClDirectConv2d.cpp @@ -26,6 +26,8 @@ #include "arm_compute/core/KernelDescriptors.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -35,8 +37,6 @@ #include "src/runtime/heuristics/direct_conv/ClDirectConvKernelConfig.h" #include "src/runtime/heuristics/direct_conv/IClDirectConvKernelConfig.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_direct_conv; namespace arm_compute @@ -53,7 +53,8 @@ ITensorPack select_activation_src_dst(ITensorPack &tensors) return pack; } -DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_direct_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget 
gpu_target = CLScheduler::get().target(); @@ -65,8 +66,13 @@ DirectConvComputeKernelInfo config_direct_convolution_nhwc(const ITensorInfo *sr } // namespace -void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClDirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -75,15 +81,17 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); // Configure direct convolution kernel - const ActivationLayerInfo conv2d_act_info = (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info : ActivationLayerInfo(); - auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); + const ActivationLayerInfo conv2d_act_info = + (src->data_layout() == DataLayout::NHWC && is_data_type_float(src->data_type())) ? act_info + : ActivationLayerInfo(); + auto k = std::make_unique<kernels::ClDirectConv2dKernel>(); k->set_target(CLScheduler::get().target()); k->configure(compile_context, src, weights, biases, dst, conv_info, conv2d_act_info, desc); _direct_conv_kernel = std::move(k); // Configure border handler PixelValue zero_value(0.f); - if(is_data_type_quantized_asymmetric(src->data_type())) + if (is_data_type_quantized_asymmetric(src->data_type())) { zero_value = PixelValue(0, src->data_type(), src->quantization_info()); } @@ -92,7 +100,7 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI _src_border_handler = std::move(b); // Fused activation is currently supported for NHWC and floating point types - if(act_info.enabled() && !conv2d_act_info.enabled()) + if (act_info.enabled() && !conv2d_act_info.enabled()) { auto a = std::make_unique<kernels::ClActivationKernel>(); a->configure(compile_context, dst, dst, act_info); @@ -103,14 +111,19 @@ void ClDirectConv2d::configure(const CLCompileContext &compile_context, ITensorI CLScheduler::get().tune_kernel_static(*_direct_conv_kernel); } -Status ClDirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClDirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_direct_convolution_nhwc(src, weights, conv_info); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); - if(act_info.enabled()) + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClDirectConv2dKernel::validate(src, weights, biases, dst, conv_info, ActivationLayerInfo(), desc)); + if (act_info.enabled()) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, dst, act_info)); } @@ -124,7 +137,7 @@ void ClDirectConv2d::run(ITensorPack &tensors) // Run direct convolution 
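// Note: the trailing 'false' below is CLScheduler::enqueue_op's flush flag; the queue
// is left unflushed so the optional fused-activation kernel can join the same batch.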
CLScheduler::get().enqueue_op(*_direct_conv_kernel.get(), tensors, false); // Run activation kernel - if(_activation_kernel) + if (_activation_kernel) { auto act_pack = select_activation_src_dst(tensors); CLScheduler::get().enqueue_op(*_activation_kernel.get(), act_pack, false); diff --git a/src/gpu/cl/operators/ClDirectConv2d.h b/src/gpu/cl/operators/ClDirectConv2d.h index fedb9e971e..0f18490814 100644 --- a/src/gpu/cl/operators/ClDirectConv2d.h +++ b/src/gpu/cl/operators/ClDirectConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_DIRECT_CONV2D_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClKernel.h" #include "src/gpu/cl/IClOperator.h" @@ -59,7 +60,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. * */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -67,16 +73,20 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _direct_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _src_border_handler{ nullptr }; - std::unique_ptr<IClKernel> _activation_kernel{ nullptr }; + std::unique_ptr<IClKernel> _direct_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _src_border_handler{nullptr}; + std::unique_ptr<IClKernel> _activation_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClDirectConv3d.cpp b/src/gpu/cl/operators/ClDirectConv3d.cpp index 5d37f07f31..b08347936b 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.cpp +++ b/src/gpu/cl/operators/ClDirectConv3d.cpp @@ -24,13 +24,19 @@ #include "src/gpu/cl/operators/ClDirectConv3d.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/gpu/cl/kernels/ClDirectConv3dKernel.h" namespace arm_compute { namespace opencl { -void ClDirectConv3d::configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info) +void ClDirectConv3d::configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src0); @@ -40,7 +46,11 @@ void ClDirectConv3d::configure(const CLCompileContext &compile_context, const IT _direct_conv3d_kernel = std::move(k); } -Status ClDirectConv3d::validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info) +Status ClDirectConv3d::validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const 
ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClDirectConv3dKernel::validate(src0, src1, src2, dst, conv3d_info)); return Status{}; diff --git a/src/gpu/cl/operators/ClDirectConv3d.h b/src/gpu/cl/operators/ClDirectConv3d.h index fa58b5aedd..5fb32460e2 100644 --- a/src/gpu/cl/operators/ClDirectConv3d.h +++ b/src/gpu/cl/operators/ClDirectConv3d.h @@ -67,7 +67,12 @@ public: * @param[in] conv3d_info Contains strides, padding, rounding, activation, dilation and fast math information. Activation and fast math are currently unused. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, ITensorInfo *dst, const Conv3dInfo &conv3d_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + ITensorInfo *dst, + const Conv3dInfo &conv3d_info); /** Static function to check if given info will lead to a valid configuration * @@ -75,14 +80,18 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src0, const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const Conv3dInfo &conv3d_info); + static Status validate(const ITensorInfo *src0, + const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const Conv3dInfo &conv3d_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _direct_conv3d_kernel{ nullptr }; + std::unique_ptr<IClKernel> _direct_conv3d_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_DIRECT_CONV3D_H */ diff --git a/src/gpu/cl/operators/ClElementwiseOperations.cpp b/src/gpu/cl/operators/ClElementwiseOperations.cpp index 32d2b88798..1325371d19 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.cpp +++ b/src/gpu/cl/operators/ClElementwiseOperations.cpp @@ -23,15 +23,18 @@ */ #include "src/gpu/cl/operators/ClElementwiseOperations.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { -void ClElementwiseDivision::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseDivision::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -39,12 +42,19 @@ void ClElementwiseDivision::configure(const ClCompileContext &compile_context, I _kernel = std::move(k); } -Status ClElementwiseDivision::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseDivision::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::DIV, src1, src2, dst, act_info); } -void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMax::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -52,12 +62,19 @@ void ClElementwiseMax::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMax::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMax::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MAX, src1, src2, dst, act_info); } -void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseMin::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -65,12 +82,19 @@ void ClElementwiseMin::configure(const ClCompileContext &compile_context, ITenso _kernel = std::move(k); } -Status ClElementwiseMin::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseMin::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::MIN, src1, src2, dst, 
act_info); } -void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -78,12 +102,19 @@ void ClElementwiseSquaredDiff::configure(const ClCompileContext &compile_context _kernel = std::move(k); } -Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwiseSquaredDiff::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::SQUARED_DIFF, src1, src2, dst, act_info); } -void ClElementwisePower::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClElementwisePower::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, act_info); auto k = std::make_unique<kernels::ClArithmeticKernel>(); @@ -91,7 +122,10 @@ void ClElementwisePower::configure(const ClCompileContext &compile_context, ITen _kernel = std::move(k); } -Status ClElementwisePower::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClElementwisePower::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClArithmeticKernel::validate(ArithmeticOperation::POWER, src1, src2, dst, act_info); } diff --git a/src/gpu/cl/operators/ClElementwiseOperations.h b/src/gpu/cl/operators/ClElementwiseOperations.h index 120049cb7f..de7c018d75 100644 --- a/src/gpu/cl/operators/ClElementwiseOperations.h +++ b/src/gpu/cl/operators/ClElementwiseOperations.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_ELEMENTWISE_OPERATIONS_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -48,14 +49,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseDivision::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for max @@ -74,14 +82,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMax::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for min @@ -100,14 +115,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseMin::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for squared difference @@ -126,14 +148,21 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwiseSquaredDiff::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClArithmeticKernel for power @@ -152,14 +181,21 @@ public: * @param[out] dst Destination tensor info. Data types supported:F16/F32. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClElementwisePower::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClElementwiseUnary.cpp b/src/gpu/cl/operators/ClElementwiseUnary.cpp index f94d402c05..914621183e 100644 --- a/src/gpu/cl/operators/ClElementwiseUnary.cpp +++ b/src/gpu/cl/operators/ClElementwiseUnary.cpp @@ -23,9 +23,8 @@ */ #include "src/gpu/cl/operators/ClElementwiseUnary.h" -#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClFill.cpp b/src/gpu/cl/operators/ClFill.cpp index ad22b15cff..817b15ab20 100644 --- a/src/gpu/cl/operators/ClFill.cpp +++ b/src/gpu/cl/operators/ClFill.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClFill.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFillKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClFill::configure(const ClCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *dst_window) +void ClFill::configure(const ClCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *dst_window) { ARM_COMPUTE_LOG_PARAMS(tensor, constant_value, dst_window); auto k = std::make_unique<kernels::ClFillKernel>(); @@ -45,4 +47,4 @@ Status ClFill::validate(const ITensorInfo *tensor, const PixelValue &constant_va return kernels::ClFillKernel::validate(tensor, constant_value, dst_window); } } // 
namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClFill.h b/src/gpu/cl/operators/ClFill.h index 3bbe27ef71..e13862aa6b 100644 --- a/src/gpu/cl/operators/ClFill.h +++ b/src/gpu/cl/operators/ClFill.h @@ -26,6 +26,7 @@ #include "arm_compute/core/PixelValue.h" #include "arm_compute/core/Window.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -44,7 +45,10 @@ public: * @param[in] constant_value The value used to fill the planes of the tensor * @param[in] window Window to be used in case setting only part of a tensor. Default is nullptr. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *tensor, const PixelValue &constant_value, Window *window = nullptr); + void configure(const CLCompileContext &compile_context, + ITensorInfo *tensor, + const PixelValue &constant_value, + Window *window = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClFill::configure() diff --git a/src/gpu/cl/operators/ClFlatten.cpp b/src/gpu/cl/operators/ClFlatten.cpp index e277c0d7e4..7532532c94 100644 --- a/src/gpu/cl/operators/ClFlatten.cpp +++ b/src/gpu/cl/operators/ClFlatten.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFlatten.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFloor.cpp b/src/gpu/cl/operators/ClFloor.cpp index 84f685e381..6790160172 100644 --- a/src/gpu/cl/operators/ClFloor.cpp +++ b/src/gpu/cl/operators/ClFloor.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClFloor.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClFloorKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClFullyConnected.cpp b/src/gpu/cl/operators/ClFullyConnected.cpp index 5845bbc69e..6969ac8ab3 100644 --- a/src/gpu/cl/operators/ClFullyConnected.cpp +++ b/src/gpu/cl/operators/ClFullyConnected.cpp @@ -24,12 +24,13 @@ #include "src/gpu/cl/operators/ClFullyConnected.h" #include "arm_compute/core/Size2D.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" +#include "src/common/utils/Log.h" +#include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/operators/ClConvertFullyConnectedWeights.h" #include "src/gpu/cl/operators/ClFlatten.h" @@ -38,11 +39,8 @@ #include "src/gpu/cl/operators/ClMatMul.h" #include "src/gpu/cl/operators/ClTranspose.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - #include "src/runtime/heuristics/matmul_native/ClMatMulNativeKernelConfig.h" #include "src/runtime/heuristics/matmul_native/IClMatMulNativeKernelConfig.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include <algorithm> @@ -62,8 +60,11 @@ inline TensorShape get_reshaped_matmul_tensor(const TensorShape &src) return TensorShape(src.x(), 1, src.y(), src.collapsed_from(2).z()); // Return value optimisation } -Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo &dst, - 
GEMMLowpOutputStageInfo &gemmlowp_output_stage, ActivationLayerInfo activation_info) +Status construct_gemmlowp_output_stage(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo &dst, + GEMMLowpOutputStageInfo &gemmlowp_output_stage, + ActivationLayerInfo activation_info) { gemmlowp_output_stage.type = GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT; gemmlowp_output_stage.gemmlowp_offset = 0; @@ -73,7 +74,7 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const auto data_type = src.data_type(); // Configure output stage for quantized case - if(is_data_type_quantized_asymmetric(data_type)) + if (is_data_type_quantized_asymmetric(data_type)) { const QuantizationInfo oq_info = dst.quantization_info(); const UniformQuantizationInfo iq_unif = src.quantization_info().uniform(); @@ -85,15 +86,17 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo const float multiplier = (iq_unif.scale * wq_unif.scale) / output_quant_info.scale; int output_multiplier = 0; int output_shift = 0; - ARM_COMPUTE_RETURN_ON_ERROR(quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); + ARM_COMPUTE_RETURN_ON_ERROR( + quantization::calculate_quantized_multiplier(multiplier, &output_multiplier, &output_shift)); PixelValue type_min{}; PixelValue type_max{}; std::tie(type_min, type_max) = get_min_max(data_type); - if(activation_info.enabled()) + if (activation_info.enabled()) { - std::tie(type_min, type_max) = get_quantized_activation_min_max(activation_info, data_type, output_quant_info); + std::tie(type_min, type_max) = + get_quantized_activation_min_max(activation_info, data_type, output_quant_info); } // Set the GEMMLowp output stage info @@ -109,31 +112,41 @@ Status construct_gemmlowp_output_stage(const ITensorInfo &src, const ITensorInfo return Status{}; } -Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITensorInfo *bias, const ITensorInfo &dst, const FullyConnectedLayerInfo &fc_info, bool use_matmul) +Status validate_mm(const ITensorInfo &src, + const ITensorInfo &weights, + const ITensorInfo *bias, + const ITensorInfo &dst, + const FullyConnectedLayerInfo &fc_info, + bool use_matmul) { // Note : If input is dynamic and data is not batched, use matmul, else use gemm const bool transpose_weights = fc_info.transpose_weights ? 
!fc_info.are_weights_reshaped : false; - const bool use_dynamic_gemm = !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul - const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); + const bool use_dynamic_gemm = + !use_matmul && !weights.are_values_constant() && transpose_weights; // use dynamic gemm as fallback for matmul + const bool is_quantized = is_data_type_quantized_asymmetric(src.data_type()); - if(use_matmul) + if (use_matmul) { const MatMulInfo m_info = MatMulInfo().adj_rhs(transpose_weights); // Note: LHS is reshaped here to match ClMatMul expectations of batch index - From [M, B0, B1] to [M, 1, B0, B1] TensorInfo lhs_to_use = src.clone()->set_tensor_shape(get_reshaped_matmul_tensor(src.tensor_shape())); - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> t = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + const MatMulKernelInfo kernel_info = t->configure(&lhs_to_use, &weights, m_info); - return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info) : - kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, fc_info.activation_info); + return is_quantized ? kernels::ClMatMulLowpNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, + kernel_info, fc_info.activation_info) + : kernels::ClMatMulNativeKernel::validate(&lhs_to_use, &weights, bias, &dst, kernel_info, + fc_info.activation_info); } else { GEMMLowpOutputStageInfo gemmlowp_output_stage; - ARM_COMPUTE_RETURN_ON_ERROR(construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + construct_gemmlowp_output_stage(src, weights, dst, gemmlowp_output_stage, fc_info.activation_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -147,7 +160,7 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe true, // broadcast_bias ActivationLayerInfo()); // activation_info - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src.quantization_info().uniform(); const UniformQuantizationInfo wq_info = weights.quantization_info().uniform(); @@ -158,11 +171,9 @@ Status validate_mm(const ITensorInfo &src, const ITensorInfo &weights, const ITe const QuantizationInfo weights_quantization_info(wq_info.scale, -wq_info.offset); // Validate gemmlowp function - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate(&src.clone()->set_quantization_info(src_quantization_info), - &weights.clone()->set_quantization_info(weights_quantization_info), - bias, - &dst, - gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyCore::validate( + &src.clone()->set_quantization_info(src_quantization_info), + &weights.clone()->set_quantization_info(weights_quantization_info), bias, &dst, gemm_info)); } else { @@ -188,11 +199,15 @@ ClFullyConnected::ClFullyConnected() ClFullyConnected::~ClFullyConnected() = default; -void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo 
*weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // If weights are dynamic and matmul is supported use matmul, else use gemm - if(_use_matmul) + if (_use_matmul) { // Specify whether transpose weights is necessary in matmul info const MatMulInfo mat_info = MatMulInfo().adj_rhs(_transpose_weights); @@ -202,22 +217,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe _lhs_to_use = src->clone()->set_tensor_shape(get_reshaped_matmul_tensor(_lhs_to_use.tensor_shape())); // 2. Use heuristics to get kernel info object - const GPUTarget gpu_target = CLScheduler::get().target(); - std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); - MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); + const GPUTarget gpu_target = CLScheduler::get().target(); + std::unique_ptr<cl_matmul::IClMatMulNativeKernelConfig> kernel_config = + cl_matmul::ClMatMulNativeKernelConfigurationFactory::create(gpu_target); + MatMulKernelInfo kernel_info = kernel_config->configure(src, weights, mat_info); // 3. Configure relevant matmul kernel - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel = std::make_unique<kernels::ClMatMulLowpNativeKernel>(); _matmul_lowp_native_kernel->set_target(gpu_target); - _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_lowp_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } else { _matmul_native_kernel = std::make_unique<kernels::ClMatMulNativeKernel>(); _matmul_native_kernel->set_target(gpu_target); - _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, fc_info.activation_info); + _matmul_native_kernel->configure(compile_context, src, weights, bias, dst, kernel_info, + fc_info.activation_info); } } else @@ -238,7 +256,7 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe true, // broadcast_bias fc_info.activation_info); // activation_info - if(_is_quantized) + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -248,8 +266,10 @@ void ClFullyConnected::configure_mm(const CLCompileContext &compile_context, ITe TensorInfo src_info = src->clone()->set_quantization_info(src_quantization_info); TensorInfo weights_info = weights->clone()->set_quantization_info(weights_quantization_info); - src_info.set_quantization_info(QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); - weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_info.set_quantization_info( + QuantizationInfo(src_quantization_info.uniform().scale, -src_quantization_info.uniform().offset)); + weights_info.set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, + -weights_quantization_info.uniform().offset)); // Configure gemmlowp function _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); @@ -264,16 +284,25 @@ void ClFullyConnected::configure_mm(const CLCompileContext 
&compile_context, ITe } } -void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. - ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_ERROR_ON((weights->dimension((_use_matmul && _transpose_weights) ? 0 : 1) != + (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // If the fully connected layer is called after a convolution layer, the input tensor must be linearized // Initialize output tensor for flatten - _flattened_src = src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW); + _flattened_src = src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW); // Configure flatten kernel _flatten = std::make_unique<ClFlatten>(); @@ -284,7 +313,11 @@ void ClFullyConnected::configure_conv_fc(const CLCompileContext &compile_context configure_mm(compile_context, &_flattened_src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, +void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info) { // MatMul fuses transpose operation, so we use the first dimension for comparison where appropriate. @@ -294,7 +327,11 @@ void ClFullyConnected::configure_fc_fc(const CLCompileContext &compile_context, configure_mm(compile_context, src, weights, bias, dst, fc_info); } -void ClFullyConnected::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClFullyConnected::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); @@ -317,8 +354,9 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 2. MatMul does not support broadcasting batch dimension, and therefore is disabled if fc is batched. // 3. 
When FC is after convolution and src tensor data layout does not match weights trained data layout (weights conversion kernel is required) const bool is_batched_fc_layer = dst->dimension(1) > 1; - _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; + _use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + _dynamic_gemm = !weights->are_values_constant() && _transpose_weights && !_use_matmul; // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -327,11 +365,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // 4) Fully Connected layer -> Fully Connected layer with batches // Check if we have a fully connected layer with batches - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + _is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -341,7 +379,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso ITensorInfo *weights_used = weights; // Reshape weights if needed - Not needed when matmul is in use as matmul fuses transpose op. 
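// --- Editor's sketch (not part of the patch): the _use_matmul / _dynamic_gemm flags set
// above reduce to three mutually exclusive execution paths. A minimal standalone sketch
// of that decision, using hypothetical names (FcPath, select_fc_path) for illustration:
enum class FcPath { MatMul, DynamicGemm, StaticGemm };
inline FcPath select_fc_path(bool use_matmul, bool dynamic_gemm)
{
    if (use_matmul)   return FcPath::MatMul;      // non-constant weights, unbatched, non-Midgard target
    if (dynamic_gemm) return FcPath::DynamicGemm; // non-constant weights needing transpose, matmul ruled out
    return FcPath::StaticGemm;                    // constant weights: regular (lowp) GEMM path
}
// --- End of editor's sketch. ---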
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Reshape the weights _reshape_weights = std::make_unique<ClTranspose>(); @@ -351,14 +389,11 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso } // Convert weights if needed - if(_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (_is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Convert weights _convert_weights = std::make_unique<ClConvertFullyConnectedWeights>(); - _convert_weights->configure(compile_context, - weights_used, - &_converted_weights, - src->tensor_shape(), + _convert_weights->configure(compile_context, weights_used, &_converted_weights, src->tensor_shape(), fc_info.weights_trained_layout); weights_used = &_converted_weights; @@ -366,7 +401,7 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso _run_convert_weights = true; } - if(_is_fc_after_conv) + if (_is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches configure_conv_fc(compile_context, src, weights_used, biases, dst, fc_info); @@ -379,60 +414,69 @@ void ClFullyConnected::configure(const CLCompileContext &compile_context, ITenso // Update TensorInfo of final weights used (Need to be done in the end due to padding expansion) _weights_to_use = *weights_used; - if(_use_matmul) + if (_use_matmul) { // Note : MatMul does not use transpose and does not need auxiliary memory, so only converted weights are added to aux_mem - _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); + _aux_mem[ConvertedWeights] = + MemoryInfo(offset_int_vec(ConvertedWeights), MemoryLifetime::Temporary, _converted_weights.total_size()); } else { // Set auxiliary memory requirements for gemm operators auto gemm_mem_req = (_is_quantized) ? _mm_gemmlowp->workspace() : _mm_gemm->workspace(); - for(unsigned int i = 0; i < gemm_mem_req.size(); ++i) + for (unsigned int i = 0; i < gemm_mem_req.size(); ++i) { _aux_mem[i] = gemm_mem_req[i]; } - if(_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs + if (_aux_mem[1].size > 0 || _aux_mem[2].size > 0) // Persistent weights memory on GEMMs { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch // Keep all the auxiliary tensors in case of dynamic weights as they are recalculated every time _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, - _converted_weights.total_size()); + offset_int_vec(TransposedWeights), _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : MemoryLifetime::Prepare, + _converted_weights.total_size()); } else { // Release permuted weights at the end of prepare as they are further transposed by the assembly dispatch - const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) ?
MemoryLifetime::Persistent : MemoryLifetime::Prepare; - const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) ? MemoryLifetime::Persistent : MemoryLifetime::Prepare; - - _aux_mem[TransposedWeights] = MemoryInfo( - offset_int_vec(TransposedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, - _reshaped_weights.total_size()); - _aux_mem[ConvertedWeights] = MemoryInfo( - offset_int_vec(ConvertedWeights), - _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, - _converted_weights.total_size()); + const auto transposed_wei_lft = (_weights_to_use_idx == offset_int_vec(TransposedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + const auto converted_wei_lft = (_weights_to_use_idx == offset_int_vec(ConvertedWeights)) + ? MemoryLifetime::Persistent + : MemoryLifetime::Prepare; + + _aux_mem[TransposedWeights] = MemoryInfo(offset_int_vec(TransposedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : transposed_wei_lft, + _reshaped_weights.total_size()); + _aux_mem[ConvertedWeights] = MemoryInfo(offset_int_vec(ConvertedWeights), + _dynamic_gemm ? MemoryLifetime::Temporary : converted_wei_lft, + _converted_weights.total_size()); } } - _aux_mem[FlattenedSrc] = MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); + _aux_mem[FlattenedSrc] = + MemoryInfo(offset_int_vec(FlattenedSrc), MemoryLifetime::Temporary, _flattened_src.total_size()); } -Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, +Status ClFullyConnected::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 2); - ARM_COMPUTE_RETURN_ERROR_ON(fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU - && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); + ARM_COMPUTE_RETURN_ERROR_ON( + fc_info.activation_info.enabled() && is_data_type_quantized(src->data_type()) && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::BOUNDED_RELU && + fc_info.activation_info.activation() != ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU); const GPUTarget gpu_target = get_arch_from_target(CLScheduler::get().target()); const bool transpose_weights = fc_info.transpose_weights ? !fc_info.are_weights_reshaped : false; @@ -441,11 +485,20 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei // When using dynamic weights - use matmul kernels. // Note: MatMul does not support broadcasting, so fall back to GEMM for batched cases.
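// --- Editor's sketch (not part of the patch): "batched" here means dst carries a second
// dimension, e.g. a conv output [W, H, C, B] flattened to [W*H*C, B] against weights
// [W*H*C, N] producing dst [N, B]; MatMul cannot broadcast that batch, hence the GEMM
// fallback. Below is a standalone version of the trailing-dimension test used in this
// function, with a hypothetical helper name and plain vectors in place of TensorShape
// (assumes dst_shape has at least src_shape.size() - 2 entries, mirroring the std::equal call):
#include <algorithm>
#include <cstddef>
#include <vector>
inline bool fc_follows_conv(const std::vector<std::size_t> &src_shape, const std::vector<std::size_t> &dst_shape)
{
    // src dims from index 3 onwards must match dst dims from index 1 onwards
    return src_shape.size() >= 4 && std::equal(src_shape.cbegin() + 3, src_shape.cend(), dst_shape.cbegin() + 1);
}
// --- End of editor's sketch. ---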
const bool is_batched_fc_layer = dst->dimension(1) > 1; - const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && !is_batched_fc_layer && !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); - - const ITensorInfo &flatten_src = TensorInfo(src->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_flatten_shape(src)).set_data_layout(DataLayout::NCHW)); - const ITensorInfo &reshaped_weights = TensorInfo(weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); - const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) ? TensorInfo(*reshaped_weights.clone()) : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); + const bool use_matmul = gpu_target != GPUTarget::MIDGARD && !weights->are_values_constant() && + !is_batched_fc_layer && + !(src->num_dimensions() > 1 && (src->data_layout() != fc_info.weights_trained_layout)); + + const ITensorInfo &flatten_src = TensorInfo(src->clone() + ->set_is_resizable(true) + .reset_padding() + .set_tensor_shape(compute_flatten_shape(src)) + .set_data_layout(DataLayout::NCHW)); + const ITensorInfo &reshaped_weights = TensorInfo( + weights->clone()->set_is_resizable(true).reset_padding().set_tensor_shape(compute_transposed_shape(*weights))); + const ITensorInfo &converted_weights = (transpose_weights && !use_matmul) + ? TensorInfo(*reshaped_weights.clone()) + : TensorInfo(weights->clone()->set_is_resizable(true).reset_padding()); // With the Fully Connected layer we can have 4 different cases: // 1) Convolution layer -> Fully Connected layer without batches @@ -456,10 +509,10 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei const ITensorInfo *src_to_use = src; const ITensorInfo *weights_to_use = weights; - if(biases != nullptr) + if (biases != nullptr) { ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); - if(is_data_type_quantized(src->data_type())) + if (is_data_type_quantized(src->data_type())) { ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -470,11 +523,11 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Check if FC is after conv (flatten kernel is run in case where FC is after conv.) - if(is_batched_fc_layer) + if (is_batched_fc_layer) { - is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && (std::equal(src->tensor_shape().cbegin() + 3, - src->tensor_shape().cend(), - dst->tensor_shape().cbegin() + 1)); + is_fc_after_conv = (TensorShape::num_max_dimensions >= 4) && + (std::equal(src->tensor_shape().cbegin() + 3, src->tensor_shape().cend(), + dst->tensor_shape().cbegin() + 1)); } else { @@ -482,29 +535,28 @@ Status ClFullyConnected::validate(const ITensorInfo *src, const ITensorInfo *wei } // Transpose kernel does not run when matmul is supported as matmul fuses transpose op. 
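// --- Editor's sketch (not part of the patch): with adj_rhs the matmul kernel reads the
// weights as transposed in-kernel, so the K dimension of the weight tensor is found at
// index 0 rather than index 1. This mirrors the weight_idx expression used further down:
inline int weights_k_index(bool use_matmul, bool transpose_weights)
{
    return (use_matmul && transpose_weights) ? 0 : 1;
}
// --- End of editor's sketch. ---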
- if(transpose_weights && !use_matmul) + if (transpose_weights && !use_matmul) { // Validate reshape weights kernel ARM_COMPUTE_RETURN_ON_ERROR(ClTranspose::validate(weights, &reshaped_weights)); weights_to_use = &reshaped_weights; } - if(is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) + if (is_fc_after_conv && (src->data_layout() != fc_info.weights_trained_layout)) { // Validate convert weights kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate(weights_to_use, - &converted_weights, - src->tensor_shape(), - fc_info.weights_trained_layout)); + ARM_COMPUTE_RETURN_ON_ERROR(ClConvertFullyConnectedWeights::validate( + weights_to_use, &converted_weights, src->tensor_shape(), fc_info.weights_trained_layout)); weights_to_use = &converted_weights; } - if(is_fc_after_conv) + if (is_fc_after_conv) { // Fully Connected layer after a Convolution Layer without batches // K Index of matrix multiplication. MatMul performs transpose in kernel, so index is 0 when matmul and transpose enabled const int weight_idx = (use_matmul && transpose_weights) ? 0 : 1; - ARM_COMPUTE_RETURN_ERROR_ON((weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); + ARM_COMPUTE_RETURN_ERROR_ON( + (weights_to_use->dimension(weight_idx) != (src->dimension(0) * src->dimension(1) * src->dimension(2)))); // Validate flatten kernel ARM_COMPUTE_RETURN_ON_ERROR(ClFlatten::validate(src, &flatten_src)); @@ -539,24 +591,24 @@ void ClFullyConnected::run(ITensorPack &tensors) CLAuxTensorHandler weights(_weights_to_use_idx, _weights_to_use, tensors, false); // Linearize input if it comes from a convolutional layer - if(_is_fc_after_conv) + if (_is_fc_after_conv) { - ITensorPack flatten_pack{ { ACL_SRC, src }, { ACL_DST, flattened_src.get() } }; + ITensorPack flatten_pack{{ACL_SRC, src}, {ACL_DST, flattened_src.get()}}; _flatten->run(flatten_pack); } ITensorPack gemm_pack = tensors; gemm_pack.add_const_tensor(ACL_SRC_0, (_is_fc_after_conv) ? flattened_src.get() : src); - if(_weights_to_use_idx != ACL_SRC_1) + if (_weights_to_use_idx != ACL_SRC_1) { gemm_pack.add_const_tensor(ACL_SRC_1, weights.get()); } // Run MatMul Op - if(_use_matmul) + if (_use_matmul) { // Run matmul kernels for matrix multiplication - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, gemm_pack, true); } @@ -568,7 +620,7 @@ void ClFullyConnected::run(ITensorPack &tensors) else { // Run matrix multiply - if(_is_quantized) + if (_is_quantized) { _mm_gemmlowp->run(gemm_pack); } @@ -582,7 +634,7 @@ void ClFullyConnected::run(ITensorPack &tensors) void ClFullyConnected::prepare(ITensorPack &tensors) { // Note : Running prepare() each run when _use_matmul is true is unnecessary unless weights conversion is needed. - if(!_is_prepared || _dynamic_gemm) + if (!_is_prepared || _dynamic_gemm) { #ifdef ARM_COMPUTE_ASSERTS_ENABLED ++_asrt_prepare_count; @@ -598,10 +650,10 @@ void ClFullyConnected::prepare(ITensorPack &tensors) const ITensor *cur_weights = weights; // Reshape weights if needed. Disabled when matmul kernels are enabled as matmul fuses transpose. 
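// --- Editor's sketch (not part of the patch): prepare() below re-executes on every run
// only for the dynamic-GEMM fallback, because non-constant weights must be re-transposed
// and re-converted each time; with constant weights it runs once and is then skipped.
// A minimal version of that gate, with a hypothetical name:
inline bool needs_prepare(bool is_prepared, bool dynamic_gemm)
{
    return !is_prepared || dynamic_gemm; // matches the if (!_is_prepared || _dynamic_gemm) gate
}
// --- End of editor's sketch. ---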
- if(_transpose_weights && !_use_matmul) + if (_transpose_weights && !_use_matmul) { // Run reshape weights kernel and mark weights as unused - ITensorPack transpose_pack{ { ACL_SRC, weights }, { ACL_DST, reshaped_weights.get() } }; + ITensorPack transpose_pack{{ACL_SRC, weights}, {ACL_DST, reshaped_weights.get()}}; _reshape_weights->run(transpose_pack); cur_weights->mark_as_unused(); @@ -609,9 +661,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) } // Convert weights if needed - if(_run_convert_weights) + if (_run_convert_weights) { - ITensorPack convert_pack{ { ACL_SRC, cur_weights }, { ACL_DST, converted_weights.get() } }; + ITensorPack convert_pack{{ACL_SRC, cur_weights}, {ACL_DST, converted_weights.get()}}; _convert_weights->run(convert_pack); cur_weights->mark_as_unused(); @@ -622,9 +674,9 @@ void ClFullyConnected::prepare(ITensorPack &tensors) gemm_pack.add_const_tensor(ACL_SRC_1, cur_weights); // Run GEMM prepare and release unused weights - if(_dynamic_gemm || !_use_matmul) + if (_dynamic_gemm || !_use_matmul) { - if(!_is_quantized) + if (!_is_quantized) { _mm_gemm->prepare(gemm_pack); } diff --git a/src/gpu/cl/operators/ClFullyConnected.h b/src/gpu/cl/operators/ClFullyConnected.h index d975859d87..0621238ab5 100644 --- a/src/gpu/cl/operators/ClFullyConnected.h +++ b/src/gpu/cl/operators/ClFullyConnected.h @@ -47,7 +47,7 @@ namespace kernels { class ClMatMulNativeKernel; class ClMatMulLowpNativeKernel; -} +} // namespace kernels /** Basic function to compute a Fully Connected layer on OpenCL. This function calls the following OpenCL kernels: * * -# @ref opencl::kernels::ClIm2ColKernel (called when the input comes from a convolutional layer) @@ -88,7 +88,11 @@ public: * Data type supported: Same as @p src. * @param[in] fc_info (Optional) Fully connected layer additional info */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -96,18 +100,36 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, FullyConnectedLayerInfo fc_info = FullyConnectedLayerInfo()); // Inherited methods overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: - void configure_fc_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_conv_fc(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); - void configure_mm(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *bias, ITensorInfo *dst, const FullyConnectedLayerInfo &fc_info); + void configure_fc_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo
*weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_conv_fc(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); + void configure_mm(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *bias, + ITensorInfo *dst, + const FullyConnectedLayerInfo &fc_info); private: enum AuxTensorIdx @@ -134,19 +156,19 @@ private: TensorInfo _reshaped_weights{}; TensorInfo _lhs_to_use{}; TensorInfo _weights_to_use{}; - int _weights_to_use_idx{ ACL_SRC_1 }; + int _weights_to_use_idx{ACL_SRC_1}; - bool _run_convert_weights{ false }; - bool _transpose_weights{ false }; - bool _dynamic_gemm{ false }; - bool _use_matmul{ false }; + bool _run_convert_weights{false}; + bool _transpose_weights{false}; + bool _dynamic_gemm{false}; + bool _use_matmul{false}; - bool _is_fc_after_conv{ true }; - bool _is_quantized{ false }; - bool _is_prepared{ false }; + bool _is_fc_after_conv{true}; + bool _is_quantized{false}; + bool _is_prepared{false}; #ifdef ARM_COMPUTE_ASSERTS_ENABLED - int _asrt_run_count {}; + int _asrt_run_count{}; int _asrt_prepare_count{}; #endif // ARM_COMPUTE_ASSERTS_ENABLED }; diff --git a/src/gpu/cl/operators/ClGemm.cpp b/src/gpu/cl/operators/ClGemm.cpp index 7e331a86f3..815c254c69 100644 --- a/src/gpu/cl/operators/ClGemm.cpp +++ b/src/gpu/cl/operators/ClGemm.cpp @@ -33,11 +33,12 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" #include "arm_compute/runtime/ITensorAllocator.h" +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/utils/helpers/float_ops.h" @@ -45,8 +46,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/CL/gemm/CLGEMMKernelSelection.h" #include "src/runtime/CL/gemm_auto_heuristics/CLGEMMAutoHeuristics.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" #include "utils/TypePrinter.h" @@ -67,35 +66,43 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) return kernel_type == CLGEMMKernelType::NATIVE ? 
false : true; } //Automatically select between mlgo (prioritized) and default heuristics for gemm kernel type -inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) +inline CLGEMMKernelType +auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run, bool constant_weights) { - if(!constant_weights) + if (!constant_weights) { return CLGEMMKernelType::NATIVE; } auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -103,12 +110,14 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.has_pad_y = false; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } gemm_kernel_info.has_pad_y = true; - if(!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -116,49 +125,65 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } //Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> 
auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, - const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output) +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped kernel -inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, - const ITensorInfo *output, GEMMKernelInfo gemm_kernel_info, bool reinterpret_input_as_3d) +inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + GEMMKernelInfo gemm_kernel_info, + bool reinterpret_input_as_3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped kernel TensorInfo tmp_a_info{}; TensorInfo tmp_b_info{}; // Validate reshape LHS kernel - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); - if(!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) + auto_init_if_empty(tmp_a_info, + a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, reinterpret_input_as_3d))); + if (!bool(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, reinterpret_input_as_3d))) { return false; } // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } // Validate mm kernel gemm_kernel_info.lhs_info = lhs_info; gemm_kernel_info.rhs_info = rhs_info; - 
if(!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, rhs_info, gemm_kernel_info))) + if (!bool(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, 1.f, 0.f, lhs_info, + rhs_info, gemm_kernel_info))) { return false; } @@ -166,21 +191,32 @@ inline bool validate_lhs_rhs_info_reshaped(const GEMMLHSMatrixInfo &lhs_info, co } //Automatically select between mlgo (prioritized) and default heuristics for reshaped kernel configs -inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, GEMMKernelInfo kernel_info, const ITensorInfo *a, const ITensorInfo *b, - const ITensorInfo *c, const ITensorInfo *output, bool reinterpret_input_as_3d) +inline std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery query, + GEMMKernelInfo kernel_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + bool reinterpret_input_as_3d) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, reinterpret_input_as_3d)) + if (validate_lhs_rhs_info_reshaped(config.lhs_info, config.rhs_info, a, b, c, output, kernel_info, + reinterpret_input_as_3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), + to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } // namespace @@ -200,18 +236,24 @@ ClGemm::ClGemm() { } -void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -225,24 +267,32 @@ void ClGemm::configure_native(const CLCompileContext &compile_context, ITensorIn // Set the target for the kernels _mm_native_kernel->set_target(gpu_target); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Configure and tune matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info); + _mm_native_kernel->configure(compile_context, a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, + kernel_info); } -void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -261,32 +311,42 @@ void ClGemm::configure_reshaped(const CLCompileContext &compile_context, ITensor GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, - c, output, gemm_info.reinterpret_input_as_3d()); + std::tie(lhs_info, rhs_info) = + auto_select_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, + kernel_info, a, b, c, output, gemm_info.reinterpret_input_as_3d()); _reshape_lhs_kernel->configure(compile_context, a, &_tmp_a, lhs_info, gemm_info.reinterpret_input_as_3d()); _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); // Configure and tune matrix multiply kernel - _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_kernel->configure(compile_context, &_tmp_a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for LHS and RHS reshape matrix _aux_mem[LhsReshape] = MemoryInfo(offset_int_vec(LhsReshape), MemoryLifetime::Temporary, _tmp_a.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -304,7 +364,8 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }, kernel_info, a, b, c, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}, kernel_info, a, b, c, output); // Transpose matrix _reshape_rhs_kernel->configure(compile_context, b, &_tmp_b, rhs_info); @@ -315,24 +376,33 @@ void ClGemm::configure_reshaped_only_rhs(const CLCompileContext &compile_context // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, + kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, - const GEMMInfo &gemm_info) +void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const GPUTarget gpu_target = CLScheduler::get().target(); - bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const GPUTarget gpu_target = CLScheduler::get().target(); + bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -350,9 +420,10 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co GEMMRHSMatrixInfo rhs_info{}; // Pick up the GEMM configuration - auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -361,13 +432,22 @@ void ClGemm::configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_co // Configure matrix multiply kernel with no y padding support kernel_info.has_pad_y = false; - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, rhs_info, kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, &_tmp_b, c, output, alpha, beta, lhs_info, + rhs_info, kernel_info); // Request memory for RHS reshape matrix - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); } -Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -376,12 +456,12 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -392,15 +472,23 @@ Status ClGemm::validate_native(const ITensorInfo *a, const ITensorInfo *b, const kernel_info.broadcast_bias = broadcast_bias; kernel_info.activation_info = gemm_info.activation_info(); - auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); + auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate(a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyNativeKernel::validate( + a, b, c, output, alpha, beta, config.lhs_info, config.rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -412,12 +500,12 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con const GPUTarget gpu_target = CLScheduler::get().target(); DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -433,23 +521,33 @@ Status ClGemm::validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, con // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = + select_default_gemm_config_reshaped(auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; - auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape(compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); + auto_init_if_empty(tmp_a_info, a->clone()->set_tensor_shape( + compute_lhs_reshaped_shape(*a, lhs_info, gemm_info.reinterpret_input_as_3d()))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmReshapeLhsMatrixKernel::validate(a, &tmp_a_info, lhs_info, gemm_info.reinterpret_input_as_3d())); auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedKernel::validate(&tmp_a_info, &tmp_b_info, c, output, alpha, + beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -460,12 +558,12 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -481,24 +579,33 @@ Status ClGemm::validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInf // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info)); // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); kernel_info.has_pad_y = true; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_UNUSED(alpha); ARM_COMPUTE_UNUSED(output); @@ -508,12 +615,12 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens const GPUTarget gpu_target = CLScheduler::get().target(); const DataType data_type = a->data_type(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - const bool broadcast_bias = gemm_info.broadcast_bias(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const bool broadcast_bias = gemm_info.broadcast_bias(); GEMMKernelInfo kernel_info; kernel_info.m = m; @@ -529,9 +636,10 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails - const auto gemm_config = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, data_type, m, n, k, batch_size }); - lhs_info = gemm_config.lhs_info; - rhs_info = gemm_config.rhs_info; + const auto gemm_config = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, data_type, m, n, k, batch_size}); + lhs_info = gemm_config.lhs_info; + rhs_info = gemm_config.rhs_info; // Force H0 to 4 in order to use the MMUL extension rhs_info.h0 = 4; @@ -540,12 +648,20 @@ Status ClGemm::validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITens // Validate matrix multiply kernel_info.has_pad_y = false; - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmMatrixMultiplyReshapedOnlyRhsMMULKernel::validate( + a, &tmp_b_info, c, output, alpha, beta, lhs_info, rhs_info, kernel_info)); return Status{}; } -void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +void ClGemm::configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); @@ -558,20 +674,21 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, _is_prepared = gemm_info.retain_internal_weights(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Select GEMMType - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ CLScheduler::get().target(), a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run, - b->are_values_constant()); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{CLScheduler::get().target(), a->data_type(), m, n, k, batch_size}, + _reshape_b_only_on_first_run, b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -600,35 +717,41 @@ void ClGemm::configure(const CLCompileContext &compile_context, ITensorInfo *a, } } -Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info) +Status ClGemm::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info) { // Get the GPU target bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); // Check data type early because the auto_select_gemm_kernel has assertions on supported data types ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::F32, DataType::F16); // Select GEMMType - CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery - { - CLScheduler::get().target(), - a->data_type(), - m, - n, - k, - batch_size, - }, - gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); + CLGEMMKernelType gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{ + CLScheduler::get().target(), + a->data_type(), + m, + n, + k, + batch_size, + }, + gemm_info.reshape_b_only_on_first_run(), b->are_values_constant()); const bool fuse_add_c = (!(helpers::float_ops::is_zero(beta)) && c != nullptr); const ITensorInfo *c_to_use = fuse_add_c ? 
c : nullptr; - switch(gemm_kernel_type) + switch (gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -647,7 +770,8 @@ Status ClGemm::validate(const ITensorInfo *a, const ITensorInfo *b, const ITenso } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - ARM_COMPUTE_RETURN_ON_ERROR(validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + validate_reshaped_only_rhs_mmul(a, b, c_to_use, output, alpha, beta, gemm_info)); break; } default: @@ -674,7 +798,7 @@ void ClGemm::run(ITensorPack &tensors) prepare(tensors); // Run matrix multiply kernel - switch(_gemm_kernel_type) + switch (_gemm_kernel_type) { case CLGEMMKernelType::NATIVE: { @@ -684,13 +808,13 @@ void ClGemm::run(ITensorPack &tensors) case CLGEMMKernelType::RESHAPED: { // Run interleave kernel - ITensorPack reshape_lhs_pack{ { ACL_SRC, lhs }, { ACL_DST, lhs_reshaped.get() } }; + ITensorPack reshape_lhs_pack{{ACL_SRC, lhs}, {ACL_DST, lhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_lhs_kernel, reshape_lhs_pack, false); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // Copy original tensor pack and overwrite lhs and rhs with reshaped counterparts @@ -698,7 +822,7 @@ void ClGemm::run(ITensorPack &tensors) gemm_reshaped_pack.add_const_tensor(ACL_SRC_0, lhs_reshaped.get()); gemm_reshaped_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED) { CLScheduler::get().enqueue_op(*_mm_reshaped_kernel, gemm_reshaped_pack, true); } @@ -706,10 +830,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -722,7 +846,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -734,10 +858,10 @@ void ClGemm::run(ITensorPack &tensors) } case CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL: { - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run transpose kernel - ITensorPack reshape_rhs_pack{ { ACL_SRC, rhs }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, rhs}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, false); } // In case of RESHAPED_ONLY_RHS, we need to check the padding requirement @@ -750,7 +874,7 @@ void ClGemm::run(ITensorPack &tensors) ITensorPack gemm_reshaped_onlyrhs_pack(tensors); gemm_reshaped_onlyrhs_pack.add_const_tensor(ACL_SRC_1, rhs_reshaped.get()); - if(has_pad_y) + if (has_pad_y) { ARM_COMPUTE_ERROR_ON(has_pad_y); } @@ -769,20 +893,22 @@ void ClGemm::run(ITensorPack &tensors) void ClGemm::prepare(ITensorPack &constants) { - 
if(!_is_prepared) + if (!_is_prepared) { - const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); - ICLTensor *rhs_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); + const ITensor *src1 = constants.get_const_tensor(ACL_SRC_1); + ICLTensor *rhs_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(RhsReshape))); // If memory for RHS is persistent and src1 is provided re-transform else assume that RHS is transformed - if((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) + if ((_aux_mem[AuxTensorIdx::RhsReshape].lifetime == MemoryLifetime::Persistent) && + (src1 != nullptr && rhs_aux != nullptr) && rhs_aux) { ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Transforming RHS Matrix!"); CLAuxTensorHandler rhs_reshaped(_tmp_b, *rhs_aux); ARM_COMPUTE_ERROR_ON(rhs_reshaped.get()->cl_buffer().get() == nullptr); - ITensorPack reshape_rhs_pack{ { ACL_SRC, src1 }, { ACL_DST, rhs_reshaped.get() } }; + ITensorPack reshape_rhs_pack{{ACL_SRC, src1}, {ACL_DST, rhs_reshaped.get()}}; CLScheduler::get().enqueue_op(*_reshape_rhs_kernel, reshape_rhs_pack, true); } _is_prepared = true; diff --git a/src/gpu/cl/operators/ClGemm.h b/src/gpu/cl/operators/ClGemm.h index 11f9f2b3d8..85dc1d6c8f 100644 --- a/src/gpu/cl/operators/ClGemm.h +++ b/src/gpu/cl/operators/ClGemm.h @@ -90,30 +90,95 @@ public: * if the reshape of matrix B should happen only for the first run. GEMMInfo also contains information about the reshaping * in case matrix A and matrix B have been already transformed. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemm::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: - void configure_native(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, float alpha, float beta, 
const GEMMInfo &gemm_info); + void configure_native(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + void configure_reshaped_only_rhs_mmul(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); - static Status validate_native(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); - static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, float alpha, float beta, const GEMMInfo &gemm_info); + static Status validate_native(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); + static Status validate_reshaped_only_rhs_mmul(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + float alpha, + float beta, + const GEMMInfo &gemm_info); private: enum AuxTensorIdx diff --git a/src/gpu/cl/operators/ClGemmConv2d.cpp b/src/gpu/cl/operators/ClGemmConv2d.cpp index 5620471ff9..55d815a1ef 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.cpp +++ b/src/gpu/cl/operators/ClGemmConv2d.cpp @@ -28,10 +28,12 @@ #include "arm_compute/core/Size2D.h" #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/core/utils/quantization/AsymmHelpers.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/AutoConfiguration.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClActivationKernel.h" @@ -41,8 +43,6 @@ #include "src/gpu/cl/operators/ClGemm.h" #include "src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" namespace arm_compute @@ -53,18 +53,38 @@ using namespace utils::cast; namespace opencl { ClGemmConv2d::ClGemmConv2d() - : 
_weights_reshape_kernel(nullptr), _im2col_kernel(nullptr), _mm_gemm(nullptr), _mm_gemmlowp(nullptr), _col2im_kernel(nullptr), _activation_kernel(nullptr), _im2col_output(), _weights_reshaped(), - _gemm_output(), _skip_im2col(false), _skip_col2im(false), _is_quantized(false), _fuse_activation(true), _append_bias(false), _is_prepared(false), _aux_mem(AuxTensorIdx::Count) + : _weights_reshape_kernel(nullptr), + _im2col_kernel(nullptr), + _mm_gemm(nullptr), + _mm_gemmlowp(nullptr), + _col2im_kernel(nullptr), + _activation_kernel(nullptr), + _im2col_output(), + _weights_reshaped(), + _gemm_output(), + _skip_im2col(false), + _skip_col2im(false), + _is_quantized(false), + _fuse_activation(true), + _append_bias(false), + _is_prepared(false), + _aux_mem(AuxTensorIdx::Count) { } ClGemmConv2d::~ClGemmConv2d() = default; -void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, +void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info) + int gemm_3d_depth, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights); - ARM_COMPUTE_ERROR_THROW_ON(validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); + ARM_COMPUTE_ERROR_THROW_ON( + validate_mm(src, weights, biases, dst, gemmlowp_output_stage, gemm_3d_depth, _skip_im2col, act_info)); const GEMMInfo &gemm_info = GEMMInfo(false, // is_a_reshaped false, // is_b_reshaped @@ -77,18 +97,20 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - TensorInfo tmp_src{ *src }; - if(_is_quantized) + TensorInfo tmp_src{*src}; + if (_is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset const QuantizationInfo input_quantization_info = src->quantization_info(); const QuantizationInfo weights_quantization_info = weights->quantization_info(); - tmp_src.set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + tmp_src.set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); _mm_gemmlowp = std::make_unique<ClGemmLowpMatrixMultiplyCore>(); _mm_gemmlowp->configure(compile_context, &tmp_src, weights, biases, dst, gemm_info); @@ -97,7 +119,7 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I weights->set_quantization_info(weights_quantization_info); auto mm_mem_req = _mm_gemmlowp->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } @@ -108,15 +130,21 @@ void ClGemmConv2d::configure_mm(const ClCompileContext &compile_context, const I _mm_gemm = std::make_unique<ClGemm>(); 
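
The sign flip above is the crux of the quantized path: with the usual asymmetric scheme real = scale * (q - offset), configure_mm() stashes -offset in the temporary QuantizationInfo so the lowp kernels can apply the offset as an addition, and the original weights info is restored right after configuration. A minimal numeric sketch of that identity, using simplified stand-ins rather than the ACL types:

#include <cassert>

struct UniformQuantization
{
    float scale;
    int   offset;
};

// With real = scale * (q - offset), storing -offset lets a kernel apply
// the offset as an addition while producing the same real value.
float dequantize(int q, const UniformQuantization &qi)
{
    return qi.scale * static_cast<float>(q - qi.offset);
}

int main()
{
    const UniformQuantization src{0.05f, 10};
    const UniformQuantization negated{src.scale, -src.offset}; // as in configure_mm()

    // scale * (q - 10) == scale * (q + (-10))
    assert(dequantize(42, src) == src.scale * (42 + negated.offset));
    return 0;
}
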
_mm_gemm->configure(compile_context, &tmp_src, weights, biases, dst, 1.0f, 1.0f, gemm_info); auto mm_mem_req = _mm_gemm->workspace(); - for(unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) + for (unsigned int cont = 0; cont < mm_mem_req.size(); ++cont) { _aux_mem[cont] = mm_mem_req[cont]; } } } -Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const GEMMLowpOutputStageInfo &gemmlowp_output_stage, int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info) +Status ClGemmConv2d::validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info) { const bool is_quantized = is_data_type_quantized_asymmetric(src->data_type()); @@ -131,9 +159,9 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig false, // fp_mixed_precision true, // broadcast_bias act_info // activation_info - ); + ); - if(is_quantized) + if (is_quantized) { // Since we need negative offsets for computing convolution, we need to change QuantizationInfo() // Extract and negate input and weights offset @@ -142,8 +170,10 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig std::unique_ptr<ITensorInfo> src_qa = src->clone(); std::unique_ptr<ITensorInfo> weights_qa = weights->clone(); - src_qa->set_quantization_info(QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); - weights_qa->set_quantization_info(QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); + src_qa->set_quantization_info( + QuantizationInfo(input_quantization_info.uniform().scale, -input_quantization_info.uniform().offset)); + weights_qa->set_quantization_info( + QuantizationInfo(weights_quantization_info.uniform().scale, -weights_quantization_info.uniform().offset)); // Perform validation step on GEMMLowp return ClGemmLowpMatrixMultiplyCore::validate(src_qa.get(), weights_qa.get(), biases, dst, gemm_info); @@ -155,14 +185,17 @@ Status ClGemmConv2d::validate_mm(const ITensorInfo *src, const ITensorInfo *weig } } -void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) +void ClGemmConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, weights, dst); - ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, - conv2d_info, - weights_info)); + ARM_COMPUTE_ERROR_THROW_ON(ClGemmConv2d::validate(src, weights, biases, dst, conv2d_info, weights_info)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv2d_info, weights_info); const DataType data_type = src->data_type(); @@ -180,7 +213,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _is_prepared = weights_info.retain_internal_weights(); _is_quantized = is_data_type_quantized_asymmetric(src->data_type()); - _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 && 
conv2d_info.conv_info.stride().second == 1); + _skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); _skip_col2im = data_layout == DataLayout::NHWC; // Only for quantize there are few cases where we cannot fuse the activation function in GEMM @@ -197,12 +231,8 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // Get convolved dimensions unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; @@ -210,28 +240,31 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf _append_bias = false; _weights_reshape_kernel = std::make_unique<kernels::ClWeightsReshapeKernel>(); - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor biases_to_use = nullptr; _append_bias = true; - _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, biases, &_weights_reshaped, + conv2d_info.num_groups); } else { - _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, conv2d_info.num_groups); + _weights_reshape_kernel->configure(compile_context, weights, nullptr, &_weights_reshaped, + conv2d_info.num_groups); } // Create tensor to store im2col reshaped inputs - if(!_skip_im2col) + if (!_skip_im2col) { // Configure and tune im2col. im2col output shape is auto-initialized _im2col_kernel = std::make_unique<opencl::kernels::ClIm2ColKernel>(); // Set the GPU target for im2col _im2col_kernel->set_target(CLScheduler::get().target()); - _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); + _im2col_kernel->configure(compile_context, src, &_im2col_output, Size2D(kernel_width, kernel_height), + conv2d_info.conv_info, _append_bias, conv2d_info.dilation, conv2d_info.num_groups); // Set quantization info _im2col_output.set_quantization_info(src->quantization_info()); @@ -242,7 +275,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf } // Create GEMM output tensor - if(!_skip_col2im) + if (!_skip_col2im) { TensorShape shape_gemm; @@ -263,7 +296,7 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf gemmlowp_output_stage.gemmlowp_offset = 0; // Configure output stage for quantized case - if(_is_quantized) + if (_is_quantized) { const auto output_quant_info = (dst->total_size() == 0) ? 
iq_info : oq_info; const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); @@ -286,16 +319,16 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf auto min_activation = min_val.get<int32_t>(); auto max_activation = max_val.get<int32_t>(); - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -313,48 +346,60 @@ void ClGemmConv2d::configure(const CLCompileContext &compile_context, ITensorInf // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); + configure_mm(compile_context, gemm_input_to_use, &_weights_reshaped, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, conv2d_info.act_info); - if(!_skip_col2im) + if (!_skip_col2im) { // Set the GPU target for col2im _col2im_kernel = std::make_unique<opencl::kernels::ClCol2ImKernel>(); _col2im_kernel->set_target(CLScheduler::get().target()); // Configure and tune Col2Im - _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups); + _col2im_kernel->configure(compile_context, gemm_output_to_use, dst, Size2D(conv_w, conv_h), + conv2d_info.num_groups); CLScheduler::get().tune_kernel_static(*_col2im_kernel.get()); } ARM_COMPUTE_ERROR_ON_MSG((dst->dimension(idx_width) != conv_w) || (dst->dimension(idx_height) != conv_h), "Output shape does not match the expected one"); - if(!_fuse_activation) + if (!_fuse_activation) { _activation_kernel = std::make_unique<opencl::kernels::ClActivationKernel>(); _activation_kernel->configure(compile_context, dst, nullptr, conv2d_info.act_info); } - _aux_mem[Im2ColOutput] = MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); - _aux_mem[WeightsReshaped] = MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); - _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, _gemm_output.total_size()); + _aux_mem[Im2ColOutput] = + MemoryInfo(offset_int_vec(Im2ColOutput), MemoryLifetime::Temporary, _im2col_output.total_size()); + _aux_mem[WeightsReshaped] = + MemoryInfo(offset_int_vec(WeightsReshaped), MemoryLifetime::Persistent, _weights_reshaped.total_size()); + _aux_mem[GemmOutput] = MemoryInfo(offset_int_vec(GemmOutput), MemoryLifetime::Temporary, 
_gemm_output.total_size()); } -Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const Conv2dInfo &conv2d_info, +Status ClGemmConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(src, weights, dst); ARM_COMPUTE_RETURN_ERROR_ON_MSG(weights_info.are_reshaped(), "Weights already reshaped are not supported!"); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const bool is_quantized_per_channel = is_data_type_quantized_per_channel(weights->data_type()); - if(!is_quantized_per_channel) + if (!is_quantized_per_channel) { ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(src, weights); } ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_LAYOUT(src, weights); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), "Grouping (num_groups != 1) with NHWC data layout is not supported"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), "Grouping (num_groups != 1) is not supported with QASYMM8"); - ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && (src->data_layout() == DataLayout::NCHW)); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_layout() != DataLayout::NCHW), + "Grouping (num_groups != 1) with NHWC data layout is not supported"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG((conv2d_info.num_groups != 1) && (src->data_type() == DataType::QASYMM8), + "Grouping (num_groups != 1) is not supported with QASYMM8"); + ARM_COMPUTE_RETURN_ERROR_ON(((src->dimension(2) / weights->dimension(2)) != conv2d_info.num_groups) && + (src->data_layout() == DataLayout::NCHW)); const DataLayout data_layout = src->data_layout(); const DataType data_type = src->data_type(); @@ -374,18 +419,19 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights const ITensorInfo *gemm_output_to_use = dst; const ITensorInfo *weights_to_use = weights; const bool is_quantized = is_data_type_quantized_asymmetric(data_type); - const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && conv2d_info.conv_info.stride().first == 1 - && conv2d_info.conv_info.stride().second == 1); - const bool skip_col2im = data_layout == DataLayout::NHWC; - bool fuse_activation = true; + const bool skip_im2col = (data_layout == DataLayout::NHWC && kernel_width == 1 && kernel_height == 1 && + conv2d_info.conv_info.stride().first == 1 && conv2d_info.conv_info.stride().second == 1); + const bool skip_col2im = data_layout == DataLayout::NHWC; + bool fuse_activation = true; - ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != src->dimension(idx_channel)); + ARM_COMPUTE_RETURN_ERROR_ON((weights->dimension(idx_channel) * conv2d_info.num_groups) != + src->dimension(idx_channel)); ARM_COMPUTE_RETURN_ERROR_ON(weights->num_dimensions() > 4); // Validate biases - if(biases != nullptr) + if (biases != nullptr) { - if(is_quantized) + if (is_quantized) { 
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(biases, 1, DataType::S32); } @@ -397,7 +443,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights ARM_COMPUTE_RETURN_ERROR_ON(biases->num_dimensions() > 1); } - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { ARM_COMPUTE_ERROR_ON(conv2d_info.act_info.b() > conv2d_info.act_info.a()); } @@ -406,48 +452,50 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights unsigned int conv_w = 0; unsigned int conv_h = 0; - std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), - src->dimension(idx_height), - kernel_width, - kernel_height, - conv2d_info.conv_info, - conv2d_info.dilation); + std::tie(conv_w, conv_h) = scaled_dimensions(src->dimension(idx_width), src->dimension(idx_height), kernel_width, + kernel_height, conv2d_info.conv_info, conv2d_info.dilation); unsigned int mat_weights_cols = num_kernels / conv2d_info.num_groups; const ITensorInfo *biases_to_use = biases; bool append_bias = false; - if(conv2d_info.num_groups != 1 && biases != nullptr) + if (conv2d_info.num_groups != 1 && biases != nullptr) { // num_groups != 1 can only be for NCHW // Since it is missing an utility function to reshape the biases, we append the biases into the weights tensor - biases_to_use = nullptr; - append_bias = true; - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); + biases_to_use = nullptr; + append_bias = true; + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, true, conv2d_info.num_groups), 1, data_type); } else { - weights_reshaped_info = TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); + weights_reshaped_info = + TensorInfo(compute_weights_reshaped_shape(*weights, false, conv2d_info.num_groups), 1, data_type); } weights_to_use = &weights_reshaped_info; - if(!skip_im2col) + if (!skip_im2col) { const Size2D kernel_dims(kernel_width, kernel_height); // Output tensor auto initialization if not yet initialized - TensorShape expected_output_shape = compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups == 1, conv2d_info.num_groups); + TensorShape expected_output_shape = + compute_im2col_conv_shape(src, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, + conv2d_info.num_groups == 1, conv2d_info.num_groups); auto_init_if_empty(im2col_reshaped_info, src->clone()->set_tensor_shape(expected_output_shape)); - ARM_COMPUTE_RETURN_ON_ERROR(opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, append_bias, conv2d_info.dilation, conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + opencl::kernels::ClIm2ColKernel::validate(src, &im2col_reshaped_info, kernel_dims, conv2d_info.conv_info, + append_bias, conv2d_info.dilation, conv2d_info.num_groups)); gemm_input_to_use = &im2col_reshaped_info; } // Create GEMM output tensor - if(!skip_col2im) + if (!skip_col2im) { TensorShape shape_gemm; @@ -465,7 +513,7 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights gemmlowp_output_stage.gemmlowp_offset = 0; gemmlowp_output_stage.is_quantized_per_channel = is_quantized_per_channel; - if(is_quantized) + if (is_quantized) { const UniformQuantizationInfo iq_info = src->quantization_info().uniform(); const UniformQuantizationInfo oq_info = 
dst->quantization_info().uniform(); @@ -483,16 +531,16 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights int min_activation = 0; int max_activation = 0; - const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { ActivationLayerInfo::ActivationFunction::RELU, - ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, - ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU - }; + const std::set<ActivationLayerInfo::ActivationFunction> supported_acts = { + ActivationLayerInfo::ActivationFunction::RELU, ActivationLayerInfo::ActivationFunction::BOUNDED_RELU, + ActivationLayerInfo::ActivationFunction::LU_BOUNDED_RELU}; - if(conv2d_info.act_info.enabled()) + if (conv2d_info.act_info.enabled()) { - if(supported_acts.count(conv2d_info.act_info.activation()) != 0) + if (supported_acts.count(conv2d_info.act_info.activation()) != 0) { - std::tie(min_activation, max_activation) = get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); + std::tie(min_activation, max_activation) = + get_quantized_activation_min_max(conv2d_info.act_info, data_type, output_quant_info); } else { @@ -509,16 +557,18 @@ Status ClGemmConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights // In case of NHWC, we need to run GEMM3D (gemm_3d_depth != 0) in order to avoid reshaping the output matrix const unsigned int gemm_3d_depth = (data_layout == DataLayout::NHWC) ? conv_h : 0; - ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); + ARM_COMPUTE_RETURN_ON_ERROR(validate_mm(gemm_input_to_use, weights_to_use, biases_to_use, gemm_output_to_use, + gemmlowp_output_stage, gemm_3d_depth, skip_im2col, conv2d_info.act_info)); // Validate Col2Im - if(!skip_col2im) + if (!skip_col2im) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClCol2ImKernel::validate(gemm_output_to_use, dst, Size2D(conv_w, conv_h), conv2d_info.num_groups)); } // Validate Activation Layer - if(!fuse_activation) + if (!fuse_activation) { ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClActivationKernel::validate(dst, nullptr, conv2d_info.act_info)); } @@ -541,30 +591,26 @@ void ClGemmConv2d::run(ITensorPack &tensors) CLAuxTensorHandler weights_reshaped(offset_int_vec(WeightsReshaped), _weights_reshaped, tensors, false); // Run im2col - if(!_skip_im2col) + if (!_skip_im2col) { - ITensorPack pack = - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, im2col_output.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, src}, {TensorType::ACL_DST, im2col_output.get()}}; CLScheduler::get().enqueue_op(*_im2col_kernel, pack, false); gemm_input_to_use = im2col_output.get(); } - if(!_skip_col2im) + if (!_skip_col2im) { gemm_output_to_use = gemm_output.get(); } ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, gemm_input_to_use); pack_mm.add_const_tensor(TensorType::ACL_SRC_1, weights_reshaped.get()); - if(!_append_bias) + if (!_append_bias) { pack_mm.add_const_tensor(TensorType::ACL_SRC_2, biases); } pack_mm.add_tensor(TensorType::ACL_DST, gemm_output_to_use); // Runs ClGemm or ClGemmLowpMatrixMultiplyCore functions - if(_is_quantized) + if (_is_quantized) { // Run gemmlowp _mm_gemmlowp->run(pack_mm); @@ -576,43 +622,32 @@ void ClGemmConv2d::run(ITensorPack &tensors) } // Reshape output matrix 
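
Everything in run() above moves through ITensorPack rather than positional arguments, which is what allows the pack to be copied and its ACL_SRC_1 slot overwritten with the reshaped weights. A compact sketch of that id-to-tensor pack pattern, with illustrative stand-ins (TensorPack, Tensor, and the enum below are not the ACL types):

#include <cassert>
#include <unordered_map>

enum TensorId { ACL_SRC_0, ACL_SRC_1, ACL_SRC_2, ACL_DST };

struct Tensor { /* buffer handle elided */ };

class TensorPack
{
public:
    void add_tensor(TensorId id, Tensor *t) { _map[id] = t; }
    Tensor *get_tensor(TensorId id) const
    {
        const auto it = _map.find(id);
        return it == _map.end() ? nullptr : it->second;
    }

private:
    std::unordered_map<int, Tensor *> _map;
};

int main()
{
    Tensor lhs, rhs_reshaped, dst;

    TensorPack pack;
    pack.add_tensor(ACL_SRC_0, &lhs);
    pack.add_tensor(ACL_SRC_1, &rhs_reshaped); // reshaped weights replace the raw RHS
    pack.add_tensor(ACL_DST, &dst);

    assert(pack.get_tensor(ACL_SRC_1) == &rhs_reshaped);
    return 0;
}
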
- if(!_skip_col2im) + if (!_skip_col2im) { - ITensorPack pack = - { - { TensorType::ACL_SRC, gemm_output_to_use }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, gemm_output_to_use}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_col2im_kernel.get(), pack, false); } //Run Activation Layer if we cannot fuse in GEMM - if(!_fuse_activation) + if (!_fuse_activation) { - ITensorPack pack = - { - { TensorType::ACL_SRC, dst }, - { TensorType::ACL_DST, dst } - }; + ITensorPack pack = {{TensorType::ACL_SRC, dst}, {TensorType::ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_activation_kernel.get(), pack, false); } } void ClGemmConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { // Run weights reshaping and mark original weights tensor as unused - ICLTensor *weights_reshaped_p = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); + ICLTensor *weights_reshaped_p = + utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(WeightsReshaped))); CLAuxTensorHandler weights_reshaped(_weights_reshaped, *weights_reshaped_p); auto weights = tensors.get_const_tensor(TensorType::ACL_SRC_1); - ITensorPack pack = - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, weights_reshaped.get() } - }; + ITensorPack pack = {{TensorType::ACL_SRC, weights}, {TensorType::ACL_DST, weights_reshaped.get()}}; - if(_append_bias) + if (_append_bias) { const auto biases = tensors.get_const_tensor(TensorType::ACL_SRC_2); pack.add_const_tensor(TensorType::ACL_BIAS, biases); diff --git a/src/gpu/cl/operators/ClGemmConv2d.h b/src/gpu/cl/operators/ClGemmConv2d.h index 8a46ee2dc3..e8f3147ac3 100644 --- a/src/gpu/cl/operators/ClGemmConv2d.h +++ b/src/gpu/cl/operators/ClGemmConv2d.h @@ -27,6 +27,7 @@ #include "arm_compute/core/TensorInfo.h" #include "arm_compute/core/Types.h" #include "arm_compute/runtime/FunctionDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -100,15 +101,24 @@ public: * @param[in] weights_info Specifies if the weights tensor has been reshaped with CLWeightsReshapeKernel. If this is not part of the fully connected layer the weights * tensor has also been transposed with CLGEMMReshapeRHSMatrixKernel. Data type supported: Same as @p input. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const Conv2dInfo &conv2d_info, - const WeightsInfo &weights_info = WeightsInfo()); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const Conv2dInfo &conv2d_info, + const WeightsInfo &weights_info = WeightsInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *output, const Conv2dInfo &conv2d_info, + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const Conv2dInfo &conv2d_info, const WeightsInfo &weights_info = WeightsInfo()); // Inherited methods overridden: @@ -130,9 +140,14 @@ private: * @param[in] gemm_3d_depth Depth of GEMM 3D * @param[in] act_info Activation to apply after the matrix multiplication */ - void configure_mm(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, + void configure_mm(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, const ActivationLayerInfo &act_info); + int gemm_3d_depth, + const ActivationLayerInfo &act_info); /** Static function to check if given info will lead to a valid configuration of @ref CLGEMMConvolutionLayer matrix multiply routines * * @param[in] src Input tensor info. Data types supported: QASYMM8/QASYMM8_SIGNED/F16/F32. 
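
Each configure() declared above has a static validate() twin taking the same tensor arguments, so callers can reject a configuration before any kernel objects or auxiliary memory exist; configure() itself re-runs the check via ARM_COMPUTE_ERROR_THROW_ON. A minimal sketch of that two-phase idiom, with simplified stand-ins for Status and the operator:

#include <stdexcept>

struct Status
{
    bool ok;
    explicit operator bool() const { return ok; }
};

class Operator
{
public:
    // Query-only check: no kernels or auxiliary memory are touched.
    static Status validate(int m, int n, int k)
    {
        return Status{m > 0 && n > 0 && k > 0};
    }

    void configure(int m, int n, int k)
    {
        if (!validate(m, n, k)) // mirrors ARM_COMPUTE_ERROR_THROW_ON(validate(...))
        {
            throw std::invalid_argument("invalid GEMM shape");
        }
        _m = m; _n = n; _k = k; // the real operator builds its kernels here
    }

private:
    int _m = 0, _n = 0, _k = 0;
};

int main()
{
    if (Operator::validate(32, 64, 128)) // cheap up-front query
    {
        Operator op;
        op.configure(32, 64, 128);
    }
    return 0;
}
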
@@ -148,8 +163,14 @@ private: * * @return a status */ - static Status validate_mm(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &gemmlowp_output_stage, - int gemm_3d_depth, bool skip_im2col, const ActivationLayerInfo &act_info); + static Status validate_mm(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &gemmlowp_output_stage, + int gemm_3d_depth, + bool skip_im2col, + const ActivationLayerInfo &act_info); enum AuxTensorIdx { diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp index 2622274587..71c247de79 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.cpp @@ -52,7 +52,7 @@ namespace { inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: case CLGEMMKernelType::RESHAPED_ONLY_RHS: @@ -71,32 +71,41 @@ inline bool validate_gemm_kernel(CLGEMMKernelType kernel_type) inline CLGEMMKernelType auto_select_gemm_kernel(auto_heuristics::CommonQuery query, bool reshape_b_only_on_first_run) { auto gemm_kernel = auto_heuristics::select_mlgo_gemm_kernel(query, reshape_b_only_on_first_run); - if(bool(gemm_kernel)) + if (bool(gemm_kernel)) { - if(validate_gemm_kernel(gemm_kernel.gemm_type)) + if (validate_gemm_kernel(gemm_kernel.gemm_type)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from mlgo heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } } gemm_kernel = auto_heuristics::select_default_gemm_kernel(query, reshape_b_only_on_first_run); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", to_string(gemm_kernel.gemm_type).c_str()); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use gemm kernel from default heuristics: %s.", + to_string(gemm_kernel.gemm_type).c_str()); return gemm_kernel.gemm_type; } // Validate lhs_info and rhs_info for native kernel -inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo mm_result_s32_info{}; // Output tensor auto initialization if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty( + mm_result_s32_info, + a->clone()->set_tensor_shape(compute_mm_shape(*a, *b, false, reshape_info)).set_data_type(DataType::S32)); // Validate mm kernel // NOTE: Ignore all other parameters (eg. output stage etc.) and only validate lhs and rhs info // NOTE: This assumes: // 1. lhs and rhs info's validity does not depend on these other parameters and vice versa(in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_arguments). // 2. 
lhs and rhs info does not cause window and padding issues through side effects (in CLGEMMLowpMatrixMultiplyNativeKernel.cpp validate_and_configure_window). - if(!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, reshape_info))) + if (!bool(ClGemmLowpMatrixMultiplyNativeKernel::validate(a, b, &mm_result_s32_info, lhs_info, rhs_info, + reshape_info))) { return false; } @@ -104,31 +113,45 @@ inline bool validate_lhs_rhs_info_native(const GEMMLHSMatrixInfo &lhs_info, cons } // Automatically select between mlgo (prioritized) and default heuristics for native kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, const ITensorInfo *a, const ITensorInfo *b, const GEMMReshapeInfo &reshape_info) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_native(auto_heuristics::CommonQuery query, + const ITensorInfo *a, + const ITensorInfo *b, + const GEMMReshapeInfo &reshape_info) { auto config = auto_heuristics::select_mlgo_gemm_config_native(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) + if (validate_lhs_rhs_info_native(config.lhs_info, config.rhs_info, a, b, reshape_info)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use native config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_native(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use native config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -148,7 +171,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs // Since we ignore the 
output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -156,14 +180,22 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs(const GEMMLHSMatrixInfo &lhs } // Validate lhs_info and rhs_info for reshaped only rhs kernel -inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, const GEMMRHSMatrixInfo &rhs_info, const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *output, - unsigned int m, unsigned int n, unsigned int k, bool reinterpret_input_as_3d, int depth_output_gemm3d) +inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo &lhs_info, + const GEMMRHSMatrixInfo &rhs_info, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output, + unsigned int m, + unsigned int n, + unsigned int k, + bool reinterpret_input_as_3d, + int depth_output_gemm3d) { // Validate GEMMLHSMatrixInfo and GEMMRHSMatrixInfo for reshaped only rhs kernel TensorInfo tmp_b_info{}; // Validate reshape RHS kernel auto_init_if_empty(tmp_b_info, b->clone()->set_tensor_shape(compute_rhs_reshaped_shape(*b, rhs_info))); - if(!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) + if (!bool(ClGemmReshapeRhsMatrixKernel::validate(b, &tmp_b_info, rhs_info))) { return false; } @@ -183,7 +215,8 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo // Since we ignore the output stage, output data type has to be S32 to pass the validation TensorInfo output_info_copy(*output); output_info_copy.set_data_type(DataType::S32); - if(!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, gemm_kernel_info))) + if (!bool(ClGemmLowpMatrixMultiplyReshapedOnlyRhsMMULKernel::validate(a, &tmp_b_info, &output_info_copy, + gemm_kernel_info))) { return false; } @@ -191,40 +224,55 @@ inline bool validate_lhs_rhs_info_reshaped_only_rhs_mmul(const GEMMLHSMatrixInfo } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { auto config = auto_heuristics::select_mlgo_gemm_config_reshaped_only_rhs(query); - if(config) + if (config) { - if(validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d)) + if (validate_lhs_rhs_info_reshaped_only_rhs(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d)) { - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), 
to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from mlgo heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } } config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } // Automatically select between mlgo (prioritized) and default heuristics for reshaped only rhs kernel configs -std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, bool reinterpret_input_as_3d, int depth_output_gemm3d, - const ITensorInfo *a, - const ITensorInfo *b, const ITensorInfo *output) +std::pair<GEMMLHSMatrixInfo, GEMMRHSMatrixInfo> +auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery query, + bool reinterpret_input_as_3d, + int depth_output_gemm3d, + const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *output) { ARM_COMPUTE_UNUSED(a, b, output, reinterpret_input_as_3d, depth_output_gemm3d); auto config = auto_heuristics::select_default_gemm_config_reshaped_only_rhs(query); - validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, query.k, reinterpret_input_as_3d, depth_output_gemm3d); - ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE("Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", to_string(config.lhs_info).c_str(), - to_string(config.rhs_info).c_str()); - return { config.lhs_info, config.rhs_info }; + validate_lhs_rhs_info_reshaped_only_rhs_mmul(config.lhs_info, config.rhs_info, a, b, output, query.m, query.n, + query.k, reinterpret_input_as_3d, depth_output_gemm3d); + ARM_COMPUTE_LOG_INFO_MSG_WITH_FORMAT_CORE( + "Use reshaped_only_rhs_mmul config from default heuristics: LHS info: %s ; RHS info: %s ", + to_string(config.lhs_info).c_str(), to_string(config.rhs_info).c_str()); + return {config.lhs_info, config.rhs_info}; } inline bool is_gemm_reshaped(CLGEMMKernelType kernel_type) { - switch(kernel_type) + switch (kernel_type) { case CLGEMMKernelType::NATIVE: return false; @@ -254,8 +302,11 @@ ClGemmLowpMatrixMultiplyCore::ClGemmLowpMatrixMultiplyCore() ClGemmLowpMatrixMultiplyCore::~ClGemmLowpMatrixMultiplyCore() = default; void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_context, - ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, - const GEMMInfo &gemm_info) + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_ERROR_THROW_ON(ClGemmLowpMatrixMultiplyCore::validate(a, b, c, output, gemm_info)); @@ -263,8 +314,8 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con _reshape_b_only_on_first_run = gemm_info.reshape_b_only_on_first_run(); _a_offset = a->quantization_info().uniform().offset; 
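
auto_select_gemm_config_reshaped_only_rhs() above is one of several selectors in this file that share the same shape: ask the MLGO heuristic first, keep its answer only if it validates against the actual tensors, and otherwise fall back to the default heuristic — which is why the NOTE earlier says MLGO configurations need no separate validation. A minimal sketch of that selection shape, with illustrative callables standing in for the heuristics:

#include <functional>
#include <optional>

struct GemmConfig
{
    int h0;
};

// Try MLGO first; keep its config only if it validates, else fall back.
GemmConfig select_config(const std::function<std::optional<GemmConfig>()> &mlgo,
                         const std::function<GemmConfig()>                &fallback,
                         const std::function<bool(const GemmConfig &)>    &valid)
{
    if (const auto cfg = mlgo())
    {
        if (valid(*cfg))
        {
            return *cfg; // "Use ... config from mlgo heuristics"
        }
    }
    return fallback(); // "Use ... config from default heuristics"
}

int main()
{
    const GemmConfig cfg = select_config(
        [] { return std::optional<GemmConfig>{GemmConfig{8}}; }, // MLGO suggestion
        [] { return GemmConfig{4}; },                            // built-in default
        [](const GemmConfig &c) { return c.h0 <= 4; });          // suggestion rejected here

    return cfg.h0 == 4 ? 0 : 1;
}
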
- _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && a->data_type() == DataType::QASYMM8; + _convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && a->data_type() == DataType::QASYMM8; _b_offset = _convert_to_qasymm8 ? -128 : b->quantization_info().uniform().offset; _gemm_info = gemm_info; @@ -282,17 +333,18 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con // Arguments used by GEMMReshapeInfo // in order to know how the matrices have been reshaped bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); const auto reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - _gemm_kernel_type = auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, _reshape_b_only_on_first_run); + _gemm_kernel_type = auto_select_gemm_kernel( + auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, _reshape_b_only_on_first_run); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { // Set data type for converted weights _qasymm8_weights = *b; @@ -301,47 +353,50 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con } ITensorInfo *matrix_b = _convert_to_qasymm8 ? &_qasymm8_weights : b; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? 
&_qasymm8_weights : b, &_tmp_b, + rhs_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { matrix_b = &_tmp_b; // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, reinterpret_input_as_3d, - depth_output_gemm3d, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_reshaped_only_rhs_mmul( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, reinterpret_input_as_3d, + depth_output_gemm3d, a, _convert_to_qasymm8 ? &_qasymm8_weights : b, output); // Configure reshape RHS kernel - _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, rhs_info); + _mtx_b_reshape_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_tmp_b, + rhs_info); } // Using default reduction info - const GEMMLowpReductionKernelInfo reduction_info {}; + const GEMMLowpReductionKernelInfo reduction_info{}; // Initialize matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0) + if (_a_offset != 0) { _vector_sum_col = TensorInfo(compute_reductionA_shape(*b), 1, DataType::S32); // Configure Matrix B reduction kernel - _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, &_vector_sum_col, reduction_info); + _mtx_b_reduction_kernel->configure(compile_context, _convert_to_qasymm8 ? &_qasymm8_weights : b, + &_vector_sum_col, reduction_info); } // Initialize Matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { _vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); @@ -360,17 +415,19 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.a_offset = _a_offset; gemm_kernel_info.b_offset = _b_offset; // If GEMMLowpOutputStage != NONE, fuse the offset contribution with the output stage - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { // Configure offset contribution kernel - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; _gemm_output_stage_multipliers = TensorInfo(TensorShape(num_filters), 1, DataType::S32); _gemm_output_stage_shifts = TensorInfo(TensorShape(num_filters), 1, DataType::S32); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); - if(num_filters == 1) + if (num_filters == 1) { // Per-channel quantization with OFM == 1 is equivalent to uniform quantization. 
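    // [Review note: not part of the change] For context on the surrounding configure logic,
    // the reduction kernels and the offset-contribution stage set up in this function
    // implement the standard gemmlowp decomposition. With zero points a_off and b_off and
    // K the inner dimension:
    //
    //   dst(i,j) = sum_k (A(i,k) - a_off) * (B(k,j) - b_off)
    //            =   sum_k A(i,k) * B(k,j)      // raw int32 GEMM result
    //              - a_off * sum_k B(k,j)       // column sums of B -> _vector_sum_col
    //              - b_off * sum_k A(i,k)       // row sums of A    -> _vector_sum_row
    //              + K * a_off * b_off          // scalar constant per GEMM
    //
    // This is why the matrix-B reduction is configured only when _a_offset != 0 and the
    // matrix-A reduction only when _b_offset != 0, as the guards below show.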
// Setting this flag to false prevents the kernel from adding useless padding to the output multipliers and shifts @@ -379,55 +436,67 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL && + gemmlowp_output_stage.type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { // Configure and tune matrix multiply kernel with fused output stage - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, - _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_reshaped_only_rhs_mmul_kernel->configure( + compile_context, a, matrix_b, output, gemm_kernel_info, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); } else { _run_output_stage = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { - _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { - _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, gemm_kernel_info); + _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, + gemm_kernel_info); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : matrix_b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? 
&_qasymm8_weights : matrix_b, reshape_info); // Configure matrix multiply kernel - _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, reshape_info); - - _offset_contribution_output_stage_kernel->configure(compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, output, a->dimension(0), _a_offset, _b_offset, gemmlowp_output_stage, - &_gemm_output_stage_multipliers, &_gemm_output_stage_shifts); + _mm_native_kernel->configure(compile_context, a, matrix_b, &_mm_result_s32, lhs_info, rhs_info, + reshape_info); + + _offset_contribution_output_stage_kernel->configure( + compile_context, &_mm_result_s32, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, output, a->dimension(0), + _a_offset, _b_offset, gemmlowp_output_stage, &_gemm_output_stage_multipliers, + &_gemm_output_stage_shifts); } } } else { _run_offset_contribution = true; - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { // Configure and tune matrix multiply kernel _mm_reshaped_only_rhs_mmul_kernel->configure(compile_context, a, matrix_b, output, gemm_kernel_info); @@ -436,44 +505,65 @@ void ClGemmLowpMatrixMultiplyCore::configure(const CLCompileContext &compile_con { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }, - a, _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); + std::tie(lhs_info, rhs_info) = auto_select_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}, a, + _convert_to_qasymm8 ? &_qasymm8_weights : b, reshape_info); // Configure matrix multiply kernel _mm_native_kernel->configure(compile_context, a, matrix_b, output, lhs_info, rhs_info, reshape_info); } // Configure offset contribution kernel - _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, _b_offset == 0 ? nullptr : &_vector_sum_row, - c != nullptr ? c : nullptr, a->dimension(0), _a_offset, _b_offset); + _offset_contribution_kernel->configure(compile_context, output, _a_offset == 0 ? nullptr : &_vector_sum_col, + _b_offset == 0 ? nullptr : &_vector_sum_row, c != nullptr ? c : nullptr, + a->dimension(0), _a_offset, _b_offset); } // Request memory - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - if(is_gemm_reshaped(_gemm_kernel_type)) + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + if (is_gemm_reshaped(_gemm_kernel_type)) { // Overwrite Rhs as prepare if gemm is reshaped as there will be a two-step transformation - _aux_mem[RhsQAsymm8] = MemoryInfo(offset_int_vec(RhsQAsymm8), _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, _qasymm8_weights.total_size()); - _aux_mem[RhsReshape] = MemoryInfo(offset_int_vec(RhsReshape), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); - } - if(_a_offset != 0) - { - _aux_mem[VecSumCol] = MemoryInfo(offset_int_vec(VecSumCol), _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _vector_sum_col.total_size()); - } - if(_b_offset != 0) - { - _aux_mem[VecSumRow] = MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); - } - _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); - _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, _gemm_output_stage_multipliers.total_size()); - _aux_mem[Shifts] = MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); + _aux_mem[RhsQAsymm8] = + MemoryInfo(offset_int_vec(RhsQAsymm8), + _reshape_b_only_on_first_run ? MemoryLifetime::Prepare : MemoryLifetime::Temporary, + _qasymm8_weights.total_size()); + _aux_mem[RhsReshape] = MemoryInfo( + offset_int_vec(RhsReshape), + _reshape_b_only_on_first_run ? MemoryLifetime::Persistent : MemoryLifetime::Temporary, _tmp_b.total_size()); + } + if (_a_offset != 0) + { + _aux_mem[VecSumCol] = + MemoryInfo(offset_int_vec(VecSumCol), + _reshape_b_only_on_first_run ? 
MemoryLifetime::Persistent : MemoryLifetime::Temporary, + _vector_sum_col.total_size()); + } + if (_b_offset != 0) + { + _aux_mem[VecSumRow] = + MemoryInfo(offset_int_vec(VecSumRow), MemoryLifetime::Temporary, _vector_sum_row.total_size()); + } + _aux_mem[ResultS32] = MemoryInfo(offset_int_vec(ResultS32), MemoryLifetime::Temporary, _mm_result_s32.total_size()); + _aux_mem[Multipliers] = MemoryInfo(offset_int_vec(Multipliers), MemoryLifetime::Persistent, + _gemm_output_stage_multipliers.total_size()); + _aux_mem[Shifts] = + MemoryInfo(offset_int_vec(Shifts), MemoryLifetime::Persistent, _gemm_output_stage_shifts.total_size()); } -Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info) +Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(a, b, output); ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(a, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(b, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM8, DataType::QSYMM8_PER_CHANNEL); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8 && b->data_type() == DataType::QASYMM8_SIGNED); ARM_COMPUTE_RETURN_ERROR_ON(a->data_type() == DataType::QASYMM8_SIGNED && b->data_type() == DataType::QASYMM8); ARM_COMPUTE_RETURN_ERROR_ON_MSG(gemm_info.is_a_reshaped(), "Matrix A already reshaped is not supported"); @@ -492,39 +582,44 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GPUTarget gpu_target = CLScheduler::get().target(); bool reinterpret_input_as_3d = gemm_info.reinterpret_input_as_3d(); - const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); - const unsigned int n = b->dimension(0); - const unsigned int k = a->dimension(0); - const unsigned int batch_size = reinterpret_input_as_3d ? a->dimension(3) : a->dimension(2); - const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); + const unsigned int m = reinterpret_input_as_3d ? (a->dimension(1) * a->dimension(2)) : a->dimension(1); + const unsigned int n = b->dimension(0); + const unsigned int k = a->dimension(0); + const unsigned int batch_size = reinterpret_input_as_3d ? 
a->dimension(3) : a->dimension(2); + const int depth_output_gemm3d = gemm_info.depth_output_gemm3d(); - bool reshape_matrix_b = is_gemm_reshaped(auto_select_gemm_kernel(auto_heuristics::CommonQuery{ gpu_target, a->data_type(), m, n, k, batch_size }, gemm_info.reshape_b_only_on_first_run())); + bool reshape_matrix_b = is_gemm_reshaped( + auto_select_gemm_kernel(auto_heuristics::CommonQuery{gpu_target, a->data_type(), m, n, k, batch_size}, + gemm_info.reshape_b_only_on_first_run())); const GEMMReshapeInfo reshape_info = GEMMReshapeInfo(m, n, k, 1, 1, depth_output_gemm3d, reinterpret_input_as_3d); - bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && is_data_type_quantized_symmetric(b->data_type()) - && is_data_type_quantized_asymmetric(a->data_type()); + bool convert_to_qasymm8 = is_data_type_quantized_per_channel(b->data_type()) && + is_data_type_quantized_symmetric(b->data_type()) && + is_data_type_quantized_asymmetric(a->data_type()); TensorInfo weights_info(*b); - if(convert_to_qasymm8) + if (convert_to_qasymm8) { b_offset = -128; weights_info.set_data_type(DataType::QASYMM8); ARM_COMPUTE_RETURN_ON_ERROR(ClCastKernel::validate(b, &weights_info, ConvertPolicy::WRAP)); } const ITensorInfo *matrix_b_info = &weights_info; - if(reshape_matrix_b) + if (reshape_matrix_b) { matrix_b_info = &tmp_b_info; // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_reshaped_only_rhs(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_reshaped_only_rhs( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate reshape RHS kernel - auto_init_if_empty(tmp_b_info, weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); + auto_init_if_empty(tmp_b_info, + weights_info.clone()->set_tensor_shape(compute_rhs_reshaped_shape(weights_info, rhs_info))); ARM_COMPUTE_RETURN_ON_ERROR(ClGemmReshapeRhsMatrixKernel::validate(&weights_info, &tmp_b_info, rhs_info)); } @@ -533,21 +628,23 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso const GEMMLowpReductionKernelInfo reduction_info; // Validate matrix B reduction kernel only if _a_offset is not equal to 0 - if(a_offset != 0) + if (a_offset != 0) { info_vector_sum_col = TensorInfo(compute_reductionA_shape(weights_info), 1, DataType::S32); // Configure Matrix B reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixBReductionKernel::validate(&weights_info, &info_vector_sum_col, reduction_info)); } // Validate Matrix A reduction kernel only if _b_offset is not equal to 0 - if(b_offset != 0) + if (b_offset != 0) { info_vector_sum_row = TensorInfo(compute_reductionB_shape(*a), 1, DataType::S32); // Configure matrix A reduction kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, reduction_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemmLowpMatrixAReductionKernel::validate(a, &info_vector_sum_row, 
reduction_info)); } GEMMKernelInfo gemm_kernel_info; @@ -560,92 +657,99 @@ Status ClGemmLowpMatrixMultiplyCore::validate(const ITensorInfo *a, const ITenso gemm_kernel_info.rhs_info = rhs_info; gemm_kernel_info.a_offset = a_offset; gemm_kernel_info.b_offset = b_offset; - if(gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) + if (gemm_info.gemmlowp_output_stage().type != GEMMLowpOutputStageType::NONE) { - const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; - const TensorInfo gemm_output_stage_multipliers_shifts_info(TensorInfo(TensorShape(num_filters), 1, DataType::S32)); + const TensorInfo gemm_output_stage_multipliers_shifts_info( + TensorInfo(TensorShape(num_filters), 1, DataType::S32)); GEMMLowpOutputStageInfo gemmlowp_output_stage = gemm_info.gemmlowp_output_stage(); gemmlowp_output_stage.output_data_type = a->data_type(); gemm_kernel_info.output_stage = gemmlowp_output_stage; - if(reshape_matrix_b && gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) + if (reshape_matrix_b && + gemm_info.gemmlowp_output_stage().type == GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT) { - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, &gemm_output_stage_multipliers_shifts_info, + &gemm_output_stage_multipliers_shifts_info)); } else { TensorInfo mm_result_s32_info{}; - if(reshape_matrix_b) + if (reshape_matrix_b) { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, reshape_info)) + .set_data_type(DataType::S32)); // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, gemm_kernel_info)); } else { // Output tensor auto inizialitation if not yet initialized - auto_init_if_empty(mm_result_s32_info, a->clone()->set_tensor_shape(compute_mm_shape(*matrix_a_info, *matrix_b_info, false, reshape_info)).set_data_type(DataType::S32)); + auto_init_if_empty(mm_result_s32_info, a->clone() + ->set_tensor_shape(compute_mm_shape( + *matrix_a_info, *matrix_b_info, false, reshape_info)) + .set_data_type(DataType::S32)); // Pick up the GEMM configuration // NOTE: No need to validate mlgo configurations as they automatically fall back to default heuristics if validation fails // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, &mm_result_s32_info, lhs_info, rhs_info, reshape_info)); } // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate(&mm_result_s32_info, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - output, - a_offset, b_offset, - gemmlowp_output_stage, - &gemm_output_stage_multipliers_shifts_info, - &gemm_output_stage_multipliers_shifts_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionOutputStageKernel::validate( + &mm_result_s32_info, a_offset == 0 ? nullptr : &info_vector_sum_col, + b_offset == 0 ? 
nullptr : &info_vector_sum_row, c, output, a_offset, b_offset, gemmlowp_output_stage, + &gemm_output_stage_multipliers_shifts_info, &gemm_output_stage_multipliers_shifts_info)); } } else { - if(reshape_matrix_b) + if (reshape_matrix_b) { // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate(matrix_a_info, matrix_b_info, output, gemm_kernel_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyReshapedOnlyRhsKernel::validate( + matrix_a_info, matrix_b_info, output, gemm_kernel_info)); } else { // Pick up the GEMM configuration // It doesn't matter whether Datatype is DataType::QASYMM8 or DataType::QASYMM8_SIGNED, since it only affect the shape configuration - const auto res = select_default_gemm_config_native(auto_heuristics::CommonQuery{ gpu_target, DataType::QASYMM8, m, n, k, batch_size }); - lhs_info = res.lhs_info; - rhs_info = res.rhs_info; + const auto res = select_default_gemm_config_native( + auto_heuristics::CommonQuery{gpu_target, DataType::QASYMM8, m, n, k, batch_size}); + lhs_info = res.lhs_info; + rhs_info = res.rhs_info; // Validate matrix multiply - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate(matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpMatrixMultiplyNativeKernel::validate( + matrix_a_info, matrix_b_info, output, lhs_info, rhs_info, reshape_info)); } - if(output->total_size() != 0) + if (output->total_size() != 0) { // Validate offset contribution kernel - ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate(output, - a_offset == 0 ? nullptr : &info_vector_sum_col, - b_offset == 0 ? nullptr : &info_vector_sum_row, - c, - a_offset, b_offset)); + ARM_COMPUTE_RETURN_ON_ERROR(ClGemmLowpOffsetContributionKernel::validate( + output, a_offset == 0 ? nullptr : &info_vector_sum_col, b_offset == 0 ? nullptr : &info_vector_sum_row, + c, a_offset, b_offset)); } } @@ -675,73 +779,61 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) const ITensor *matrix_a = a; const ITensor *matrix_b = _convert_to_qasymm8 ? rhs_qasymm8.get() : b; - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { matrix_b = tmp_b.get(); - if(!_reshape_b_only_on_first_run) + if (!_reshape_b_only_on_first_run) { // Run reshape matrix B - ITensorPack mtx_b_reshape_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_reshape_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_reshape_pack, false); } } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && !_reshape_b_only_on_first_run) + if (_a_offset != 0 && !_reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? 
rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Run matrix A reduction kernel only if _b_offset is not equal to 0 - if(_b_offset != 0) + if (_b_offset != 0) { - ITensorPack mtx_a_red_pack = - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_DST, vec_sum_row.get() } - }; + ITensorPack mtx_a_red_pack = {{TensorType::ACL_SRC, matrix_a}, {TensorType::ACL_DST, vec_sum_row.get()}}; CLScheduler::get().enqueue_op(*_mtx_a_reduction_kernel, mtx_a_red_pack, false); } // Run matrix multiply - if(is_gemm_reshaped(_gemm_kernel_type)) + if (is_gemm_reshaped(_gemm_kernel_type)) { ITensorPack gemm_reshaped_pack; - if(_run_offset_contribution) + if (_run_offset_contribution) { - gemm_reshaped_pack = ITensorPack({ { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_output_stage ? res32.get() : dst } - }); + gemm_reshaped_pack = ITensorPack({{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_output_stage ? res32.get() : dst}}); } else { - gemm_reshaped_pack = ITensorPack( - { - { TensorType::ACL_SRC, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + gemm_reshaped_pack = ITensorPack({ + {TensorType::ACL_SRC, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }); } - if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) + if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_kernel, gemm_reshaped_pack, false); } - else if(_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) + else if (_gemm_kernel_type == CLGEMMKernelType::RESHAPED_ONLY_RHS_MMUL) { CLScheduler::get().enqueue_op(*_mm_reshaped_only_rhs_mmul_kernel, gemm_reshaped_pack, false); } @@ -752,46 +844,39 @@ void ClGemmLowpMatrixMultiplyCore::run(ITensorPack &tensors) } else { - ITensorPack gemm_native_pack = - { - { TensorType::ACL_SRC_0, matrix_a }, - { TensorType::ACL_SRC_1, matrix_b }, - { TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get() } - }; + ITensorPack gemm_native_pack = {{TensorType::ACL_SRC_0, matrix_a}, + {TensorType::ACL_SRC_1, matrix_b}, + {TensorType::ACL_DST, _run_offset_contribution ? dst : res32.get()}}; CLScheduler::get().enqueue_op(*_mm_native_kernel, gemm_native_pack, false); } - if(_run_output_stage) + if (_run_output_stage) { // Run offset contribution/output stage kernel - ITensorPack output_stage_pack = - { - { TensorType::ACL_SRC, res32.get() }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? 
nullptr : vec_sum_col.get() }, - { TensorType::ACL_SHIFTS, shifts.get() }, - { TensorType::ACL_MULTIPLIERS, multipliers.get() }, - { TensorType::ACL_DST, dst }, + ITensorPack output_stage_pack = { + {TensorType::ACL_SRC, res32.get()}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}, + {TensorType::ACL_SHIFTS, shifts.get()}, + {TensorType::ACL_MULTIPLIERS, multipliers.get()}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_offset_contribution_output_stage_kernel, output_stage_pack, true); } - if(_run_offset_contribution) + if (_run_offset_contribution) { // Run offset contribution kernel - ITensorPack offset_contrib_pack = - { - { TensorType::ACL_SRC_DST, dst }, - { TensorType::ACL_BIAS, c }, - { TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get() }, - { TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get() } - }; + ITensorPack offset_contrib_pack = {{TensorType::ACL_SRC_DST, dst}, + {TensorType::ACL_BIAS, c}, + {TensorType::ACL_VEC_ROW_SUM, _b_offset == 0 ? nullptr : vec_sum_row.get()}, + {TensorType::ACL_VEC_COL_SUM, _a_offset == 0 ? nullptr : vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_offset_contribution_kernel, offset_contrib_pack, true); } } void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { auto b = tensors.get_const_tensor(TensorType::ACL_SRC_1); CLAuxTensorHandler tmp_b(offset_int_vec(RhsReshape), _tmp_b, tensors, true); @@ -800,56 +885,55 @@ void ClGemmLowpMatrixMultiplyCore::prepare(ITensorPack &tensors) ARM_COMPUTE_ERROR_ON_NULLPTR(b); - if(_convert_to_qasymm8) + if (_convert_to_qasymm8) { - ITensorPack convert_to_qs8_pack = { { ACL_SRC, b }, { ACL_DST, rhs_qasymm8.get() } }; + ITensorPack convert_to_qs8_pack = {{ACL_SRC, b}, {ACL_DST, rhs_qasymm8.get()}}; CLScheduler::get().enqueue_op(*_weights_to_qasymm8, convert_to_qs8_pack, false); b->mark_as_unused(); } - if(is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) + if (is_gemm_reshaped(_gemm_kernel_type) && _reshape_b_only_on_first_run) { // Run reshape kernel and mark original weights tensor as unused - ITensorPack mtx_b_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, tmp_b.get() } - }; + ITensorPack mtx_b_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, tmp_b.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reshape_kernel, mtx_b_pack, false); b->mark_as_unused(); } // Run matrix B reduction kernel only if _a_offset is not equal to 0 - if(_a_offset != 0 && _reshape_b_only_on_first_run) + if (_a_offset != 0 && _reshape_b_only_on_first_run) { - ITensorPack mtx_b_red_pack = - { - { TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b }, - { TensorType::ACL_DST, vec_sum_col.get() } - }; + ITensorPack mtx_b_red_pack = {{TensorType::ACL_SRC, _convert_to_qasymm8 ? rhs_qasymm8.get() : b}, + {TensorType::ACL_DST, vec_sum_col.get()}}; CLScheduler::get().enqueue_op(*_mtx_b_reduction_kernel, mtx_b_red_pack, false); } // Compute GEMM output multipliers and shifts for output stage { - const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) ? _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() : 1; + const size_t num_filters = (_gemm_info.gemmlowp_output_stage().is_quantized_per_channel) + ? 
_gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.size() + : 1; CLAuxTensorHandler multipliers(offset_int_vec(Multipliers), _gemm_output_stage_multipliers, tensors, false); CLAuxTensorHandler shifts(offset_int_vec(Shifts), _gemm_output_stage_shifts, tensors, false); ICLTensor *multiplier_tensor = multipliers.get(); - if(multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) + if (multiplier_tensor != nullptr && multiplier_tensor->info()->total_size() > 0) { multiplier_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), num_filters * sizeof(int32_t)); + std::memcpy(multiplier_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_multipliers.data(), + num_filters * sizeof(int32_t)); multiplier_tensor->unmap(CLScheduler::get().queue()); } ICLTensor *shifts_tensor = shifts.get(); - if(shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) + if (shifts.get() != nullptr && shifts_tensor->info()->total_size() > 0) { shifts_tensor->map(CLScheduler::get().queue(), true); - std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); + std::memcpy(shifts_tensor->ptr_to_element(Coordinates(0)), + _gemm_info.gemmlowp_output_stage().gemmlowp_shifts.data(), num_filters * sizeof(int32_t)); shifts_tensor->unmap(CLScheduler::get().queue()); } } diff --git a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h index 6e32a90fc4..c80dc3a182 100644 --- a/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h +++ b/src/gpu/cl/operators/ClGemmLowpMatrixMultiplyCore.h @@ -93,18 +93,27 @@ public: * @param[in] gemm_info (Optional) Specifies if the matrix A and/or matrix B have been reshaped and * if the reshape of matrix B should be executed only for the first run */ - void configure(const CLCompileContext &compile_context, ITensorInfo *a, ITensorInfo *b, ITensorInfo *c, ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *a, + ITensorInfo *b, + ITensorInfo *c, + ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpMatrixMultiplyCore::configure() * * @return a status */ - static Status validate(const ITensorInfo *a, const ITensorInfo *b, const ITensorInfo *c, const ITensorInfo *output, const GEMMInfo &gemm_info = GEMMInfo()); + static Status validate(const ITensorInfo *a, + const ITensorInfo *b, + const ITensorInfo *c, + const ITensorInfo *output, + const GEMMInfo &gemm_info = GEMMInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -130,7 +139,7 @@ private: std::unique_ptr<kernels::ClGemmLowpMatrixAReductionKernel> _mtx_a_reduction_kernel; std::unique_ptr<kernels::ClGemmLowpMatrixBReductionKernel> _mtx_b_reduction_kernel; std::unique_ptr<kernels::ClGemmLowpOffsetContributionKernel> _offset_contribution_kernel; - std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; + 
std::unique_ptr<kernels::ClGemmLowpOffsetContributionOutputStageKernel> _offset_contribution_output_stage_kernel; // Temporary tensors TensorInfo _qasymm8_weights{}; @@ -141,13 +150,13 @@ private: TensorInfo _gemm_output_stage_multipliers{}; TensorInfo _gemm_output_stage_shifts{}; - int32_t _a_offset{ 0 }; - int32_t _b_offset{ 0 }; - bool _reshape_b_only_on_first_run{ false }; - bool _run_output_stage{ false }; - bool _convert_to_qasymm8{ false }; - bool _run_offset_contribution{ false }; - bool _is_prepared{ false }; + int32_t _a_offset{0}; + int32_t _b_offset{0}; + bool _reshape_b_only_on_first_run{false}; + bool _run_output_stage{false}; + bool _convert_to_qasymm8{false}; + bool _run_offset_contribution{false}; + bool _is_prepared{false}; GEMMInfo _gemm_info{}; CLGEMMKernelType _gemm_kernel_type{}; diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp index a61b11a3b1..e3363e3685 100644 --- a/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.cpp @@ -27,22 +27,25 @@ #include "arm_compute/core/Types.h" #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleByFloatKernel.h" #include "src/gpu/cl/kernels/ClGemmLowpQuantizeDownInt32ScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src, dst); ARM_COMPUTE_LOG_PARAMS(src, bias, dst, info); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: { @@ -70,12 +73,16 @@ void ClGemmLowpOutputStage::configure(const CLCompileContext &compile_context, c } } -Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info) +Status ClGemmLowpOutputStage::validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(dst, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::QSYMM16); - switch(info.type) + switch (info.type) { case GEMMLowpOutputStageType::QUANTIZE_DOWN_FIXEDPOINT: return opencl::kernels::ClGemmLowpQuantizeDownInt32ScaleByFixedPointKernel::validate(src, bias, dst, &info); @@ -94,7 +101,7 @@ void ClGemmLowpOutputStage::run(ITensorPack &tensors) const ITensor *bias = tensors.get_const_tensor(ACL_BIAS); ITensor *dst = tensors.get_tensor(ACL_DST); - ITensorPack pack{ { ACL_SRC, src }, { ACL_BIAS, bias }, { ACL_DST, dst } }; + ITensorPack pack{{ACL_SRC, src}, {ACL_BIAS, bias}, {ACL_DST, dst}}; CLScheduler::get().enqueue_op(*_kernel, pack, true); } } // namespace opencl diff --git a/src/gpu/cl/operators/ClGemmLowpOutputStage.h b/src/gpu/cl/operators/ClGemmLowpOutputStage.h index 3f1b04dcce..6357e0200b 100644 --- 
a/src/gpu/cl/operators/ClGemmLowpOutputStage.h +++ b/src/gpu/cl/operators/ClGemmLowpOutputStage.h @@ -71,14 +71,21 @@ public: * @param[out] dst Destination tensor. Data type supported: QASYMM8/QASYMM8_SIGNED * @param[in] info GEMMLowp output stage metadata. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, const ITensorInfo *bias, ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + const ITensorInfo *bias, + ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClGemmLowpOutputStage::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *bias, const ITensorInfo *dst, const GEMMLowpOutputStageInfo &info); + static Status validate(const ITensorInfo *src, + const ITensorInfo *bias, + const ITensorInfo *dst, + const GEMMLowpOutputStageInfo &info); // Inherited methods overridden: void run(ITensorPack &tensors) override; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.cpp b/src/gpu/cl/operators/ClIndirectConv2d.cpp index b900974574..777fc9e5e1 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.cpp +++ b/src/gpu/cl/operators/ClIndirectConv2d.cpp @@ -27,16 +27,15 @@ #include "arm_compute/core/Types.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" +#include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClIndirectConv2dAddressPrecalculationKernel.h" #include "src/gpu/cl/kernels/ClIndirectConv2dKernel.h" +#include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "src/runtime/heuristics/indirect_conv/ClIndirectConvKernelConfig.h" #include "src/runtime/heuristics/indirect_conv/IClIndirectConvKernelConfig.h" -#include "src/core/helpers/MemoryHelpers.h" -#include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" - using namespace arm_compute::cl_indirect_conv; namespace arm_compute @@ -47,7 +46,8 @@ using namespace arm_compute::experimental; namespace { -DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) +DirectConvComputeKernelInfo +config_indirect_convolution_nhwc(const ITensorInfo *src, const ITensorInfo *weights, const PadStrideInfo &conv_info) { // Get GPU target GPUTarget gpu_target = CLScheduler::get().target(); @@ -59,8 +59,13 @@ DirectConvComputeKernelInfo config_indirect_convolution_nhwc(const ITensorInfo * } // namespace -void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +void ClIndirectConv2d::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info); @@ -86,25 +91,29 @@ void ClIndirectConv2d::configure(const CLCompileContext &compile_context, ITenso CLScheduler::get().tune_kernel_static(*_indirect_conv_kernel); // Request memory for the indirect buffer - _aux_mem[IndirectBuffer] = MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, 
_indirect_buffer.total_size()); + _aux_mem[IndirectBuffer] = + MemoryInfo(offset_int_vec(IndirectBuffer), MemoryLifetime::Persistent, _indirect_buffer.total_size()); } -Status ClIndirectConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info) +Status ClIndirectConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info) { // Initialize the direct convolution descriptor const DirectConvComputeKernelInfo desc = config_indirect_convolution_nhwc(src, weights, conv_info); - TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape(src->tensor_shape(), - src->data_layout(), - weights->tensor_shape(), - conv_info, - desc); + TensorShape ind_buffer_shape = misc::shape_calculator::compute_indirect_buffer_shape( + src->tensor_shape(), src->data_layout(), weights->tensor_shape(), conv_info, desc); TensorInfo indirect_buffer(ind_buffer_shape, 1, DataType::S32); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate(src, weights, &indirect_buffer, conv_info, desc)); - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, conv_info, act_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dAddressPrecalculationKernel::validate( + src, weights, &indirect_buffer, conv_info, desc)); + ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClIndirectConv2dKernel::validate(src, weights, biases, &indirect_buffer, dst, + conv_info, act_info, desc)); return Status{}; } @@ -124,9 +133,10 @@ void ClIndirectConv2d::run(ITensorPack &tensors) void ClIndirectConv2d::prepare(ITensorPack &constants) { - if(!_is_prepared) + if (!_is_prepared) { - ICLTensor *indirect_buffer_aux = utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); + ICLTensor *indirect_buffer_aux = + utils::cast::polymorphic_downcast<ICLTensor *>(constants.get_tensor(offset_int_vec(IndirectBuffer))); ARM_COMPUTE_ERROR_ON(indirect_buffer_aux == nullptr); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Preparing indirect buffer"); @@ -134,7 +144,7 @@ void ClIndirectConv2d::prepare(ITensorPack &constants) CLAuxTensorHandler indirect_buffer(_indirect_buffer, *indirect_buffer_aux); ARM_COMPUTE_ERROR_ON(indirect_buffer.get()->cl_buffer().get() == nullptr); - ITensorPack indirect_buffer_pack{ { ACL_DST, indirect_buffer.get() } }; + ITensorPack indirect_buffer_pack{{ACL_DST, indirect_buffer.get()}}; CLScheduler::get().enqueue_op(*_addr_precalculation_kernel, indirect_buffer_pack, true); _is_prepared = true; diff --git a/src/gpu/cl/operators/ClIndirectConv2d.h b/src/gpu/cl/operators/ClIndirectConv2d.h index e50fa25069..29e796efd9 100644 --- a/src/gpu/cl/operators/ClIndirectConv2d.h +++ b/src/gpu/cl/operators/ClIndirectConv2d.h @@ -77,7 +77,12 @@ public: * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
* */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -85,12 +90,16 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &constants) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &constants) override; experimental::MemoryRequirements workspace() const override; private: @@ -100,11 +109,11 @@ private: Count }; - std::unique_ptr<IClKernel> _indirect_conv_kernel{ nullptr }; - std::unique_ptr<IClKernel> _addr_precalculation_kernel{ nullptr }; + std::unique_ptr<IClKernel> _indirect_conv_kernel{nullptr}; + std::unique_ptr<IClKernel> _addr_precalculation_kernel{nullptr}; TensorInfo _indirect_buffer{}; - bool _is_prepared{ false }; - experimental::MemoryRequirements _aux_mem{ Count }; + bool _is_prepared{false}; + experimental::MemoryRequirements _aux_mem{Count}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClLogicalNot.cpp b/src/gpu/cl/operators/ClLogicalNot.cpp index b2eb89b320..d8d4186d00 100644 --- a/src/gpu/cl/operators/ClLogicalNot.cpp +++ b/src/gpu/cl/operators/ClLogicalNot.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClLogicalNot.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseUnaryKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl diff --git a/src/gpu/cl/operators/ClMatMul.cpp b/src/gpu/cl/operators/ClMatMul.cpp index 49d14127ca..c14b1f2992 100644 --- a/src/gpu/cl/operators/ClMatMul.cpp +++ b/src/gpu/cl/operators/ClMatMul.cpp @@ -47,11 +47,17 @@ ClMatMul::ClMatMul() { } -Status ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +Status ClMatMul::validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(lhs, rhs, dst); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(lhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(rhs, 1, DataType::QASYMM8, DataType::QASYMM8_SIGNED, + DataType::F16, DataType::F32); const GPUTarget gpu_target = CLScheduler::get().target(); @@ -61,11 +67,16 @@ Status 
ClMatMul::validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const const bool is_quantized = is_data_type_quantized_asymmetric(lhs->data_type()); - return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) : - ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + return is_quantized ? ClMatMulLowpNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info) + : ClMatMulNativeKernel::validate(lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); } -void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info) +void ClMatMul::configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(lhs, rhs, dst); ARM_COMPUTE_LOG_PARAMS(lhs, rhs, dst, matmul_info); @@ -81,12 +92,13 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l MatMulKernelInfo kernel_info = t->configure(lhs, rhs, matmul_info); - if(_is_quantized) + if (_is_quantized) { _matmul_lowp_native_kernel->set_target(gpu_target); // Configure the low-precision native matrix multiply kernel - _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, act_info); + _matmul_lowp_native_kernel->configure(compile_context, lhs, rhs, nullptr /* bias */, dst, kernel_info, + act_info); } else { @@ -99,7 +111,7 @@ void ClMatMul::configure(const CLCompileContext &compile_context, ITensorInfo *l void ClMatMul::run(ITensorPack &tensors) { - if(_is_quantized) + if (_is_quantized) { CLScheduler::get().enqueue_op(*_matmul_lowp_native_kernel, tensors, true); } diff --git a/src/gpu/cl/operators/ClMatMul.h b/src/gpu/cl/operators/ClMatMul.h index abbb75239a..64dcf217bd 100644 --- a/src/gpu/cl/operators/ClMatMul.h +++ b/src/gpu/cl/operators/ClMatMul.h @@ -26,6 +26,7 @@ #include "arm_compute/function_info/ActivationLayerInfo.h" #include "arm_compute/function_info/MatMulInfo.h" + #include "src/gpu/cl/IClOperator.h" #include "src/gpu/cl/kernels/ClMatMulLowpNativeKernel.h" #include "src/gpu/cl/kernels/ClMatMulNativeKernel.h" @@ -73,7 +74,11 @@ public: * @param[in] matmul_info Contains MatMul operation information described in @ref MatMulInfo. * @param[in] act_info Class containing information about fused activation function. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *lhs, ITensorInfo *rhs, ITensorInfo *dst, const MatMulInfo &matmul_info, + void configure(const CLCompileContext &compile_context, + ITensorInfo *lhs, + ITensorInfo *rhs, + ITensorInfo *dst, + const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -81,15 +86,19 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *lhs, const ITensorInfo *rhs, const ITensorInfo *dst, const MatMulInfo &matmul_info, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *lhs, + const ITensorInfo *rhs, + const ITensorInfo *dst, + const MatMulInfo &matmul_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); // Inherited methods overridden: void run(ITensorPack &tensors) override; private: - std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{ nullptr }; - std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{ nullptr }; + std::unique_ptr<kernels::ClMatMulNativeKernel> _matmul_native_kernel{nullptr}; + std::unique_ptr<kernels::ClMatMulLowpNativeKernel> _matmul_lowp_native_kernel{nullptr}; - bool _is_quantized{ false }; + bool _is_quantized{false}; }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.cpp b/src/gpu/cl/operators/ClMul.cpp index 2066f0cfaa..10cf8a6a38 100644 --- a/src/gpu/cl/operators/ClMul.cpp +++ b/src/gpu/cl/operators/ClMul.cpp @@ -24,17 +24,23 @@ #include "src/gpu/cl/operators/ClMul.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClMulKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClMulKernel.h" namespace arm_compute { namespace opencl { -void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +void ClMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); auto k = std::make_unique<kernels::ClMulKernel>(); @@ -42,22 +48,34 @@ void ClMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info) +Status ClMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info) { return kernels::ClMulKernel::validate(src1, src2, dst, scale, overflow_policy, rounding_policy, act_info); } -void ClComplexMul::configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info) +void ClComplexMul::configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + 
ITensorInfo *dst, + const ActivationLayerInfo &act_info) { auto k = std::make_unique<kernels::ClComplexMulKernel>(); k->configure(compile_context, src1, src2, dst, act_info); _kernel = std::move(k); } -Status ClComplexMul::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info) +Status ClComplexMul::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info) { return kernels::ClComplexMulKernel::validate(src1, src2, dst, act_info); } } // namespace opencl -} // namespace arm_compute
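[Review note: not part of the change] ClMul and ClComplexMul are thin single-kernel operators: configure() instantiates and configures the kernel, validate() forwards straight to the kernel's static check, and the inherited IClOperator::run() enqueues it. A hedged caller-side sketch under assumed context (tensor names are illustrative, arm_compute/arm_compute::opencl namespaces and the usual CL runtime headers in scope):

    opencl::ClMul mul;
    const Status st = opencl::ClMul::validate(&src1_info, &src2_info, &dst_info, /* scale */ 1.f,
                                              ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);
    if (st.error_code() == ErrorCode::OK)
    {
        mul.configure(CLKernelLibrary::get().get_compile_context(), &src1_info, &src2_info, &dst_info,
                      1.f, ConvertPolicy::SATURATE, RoundingPolicy::TO_NEAREST_UP);
        // Tensors are bound at run time through an ITensorPack, as elsewhere in this diff
        ITensorPack pack{{TensorType::ACL_SRC_0, &src1}, {TensorType::ACL_SRC_1, &src2},
                         {TensorType::ACL_DST, &dst}};
        mul.run(pack); // IClOperator::run() enqueues the configured ClMulKernel
    }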
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClMul.h b/src/gpu/cl/operators/ClMul.h index 6086bc9d52..1cf4d68d4c 100644 --- a/src/gpu/cl/operators/ClMul.h +++ b/src/gpu/cl/operators/ClMul.h @@ -66,16 +66,27 @@ public: * @param[in] rounding_policy Rounding policy. Supported rounding modes: to zero, to nearest even. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, float scale, - ConvertPolicy overflow_policy, RoundingPolicy rounding_policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + float scale, + ConvertPolicy overflow_policy, + RoundingPolicy rounding_policy, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; /** Basic function to run @ref opencl::kernels::ClComplexMulKernel */ @@ -92,14 +103,21 @@ public: * @param[out] dst The dst tensor info, Data types supported: same as @p src1. Number of channels supported: same as @p src1. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const CLCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + void configure(const CLCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * * Similar to @ref ClComplexMul::configure() * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, const ActivationLayerInfo &act_info = ActivationLayerInfo()); + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.cpp b/src/gpu/cl/operators/ClPRelu.cpp index cf4ebe6083..f3efd00bba 100644 --- a/src/gpu/cl/operators/ClPRelu.cpp +++ b/src/gpu/cl/operators/ClPRelu.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPRelu.h" -#include "src/gpu/cl/kernels/ClElementwiseKernel.h" - #include "src/common/utils/Log.h" +#include "src/gpu/cl/kernels/ClElementwiseKernel.h" namespace arm_compute { namespace opencl { using KernelType = kernels::ClArithmeticKernel; -void ClPRelu::configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output) +void ClPRelu::configure(const CLCompileContext &compile_context, + ITensorInfo *input, + ITensorInfo *alpha, + ITensorInfo *output) { ARM_COMPUTE_LOG_PARAMS(input, alpha, output); auto k = std::make_unique<KernelType>(); @@ -49,7 +51,7 @@ void ClPRelu::run(ITensorPack &tensors) { // Output tensor can be given as nullptr for in-place computation. // In this case, get the input tensor and use it as the output tensor. - if(tensors.get_tensor(TensorType::ACL_DST) == nullptr) + if (tensors.get_tensor(TensorType::ACL_DST) == nullptr) { auto src_tensor = const_cast<ITensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_0)); ARM_COMPUTE_ERROR_ON_MSG(src_tensor == nullptr, "invalid source tensor is given for in-place computation"); @@ -58,4 +60,4 @@ void ClPRelu::run(ITensorPack &tensors) IClOperator::run(tensors); } } // namespace opencl -} // namespace arm_compute
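The run() override above enables in-place execution: when the pack carries no ACL_DST tensor, ACL_SRC_0 is re-registered as the destination. A hypothetical invocation, assuming prelu, input and alpha were created and configured elsewhere:

    ITensorPack pack;
    pack.add_const_tensor(TensorType::ACL_SRC_0, &input); // activations (assumed CLTensor)
    pack.add_const_tensor(TensorType::ACL_SRC_1, &alpha); // PRelu slope parameters (assumed CLTensor)
    // Deliberately no ACL_DST: run() detects this and writes the result back into ACL_SRC_0.
    prelu.run(pack);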
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPRelu.h b/src/gpu/cl/operators/ClPRelu.h index 8084ab86cd..45ce858fb0 100644 --- a/src/gpu/cl/operators/ClPRelu.h +++ b/src/gpu/cl/operators/ClPRelu.h @@ -47,7 +47,8 @@ public: * @param[in] alpha PRelu layer parameters. Data types supported: same of @p input. * @param[out] output Destination tensor. Data type supported: same as @p input */ - void configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); + void + configure(const CLCompileContext &compile_context, ITensorInfo *input, ITensorInfo *alpha, ITensorInfo *output); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPRelu::configure() diff --git a/src/gpu/cl/operators/ClPermute.cpp b/src/gpu/cl/operators/ClPermute.cpp index ed56f97bfe..3851e22b6a 100644 --- a/src/gpu/cl/operators/ClPermute.cpp +++ b/src/gpu/cl/operators/ClPermute.cpp @@ -23,16 +23,18 @@ */ #include "src/gpu/cl/operators/ClPermute.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPermuteKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPermute::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm) +void ClPermute::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm) { ARM_COMPUTE_LOG_PARAMS(src, dst, perm); auto k = std::make_unique<kernels::ClPermuteKernel>(); @@ -45,4 +47,4 @@ Status ClPermute::validate(const ITensorInfo *src, const ITensorInfo *dst, const return kernels::ClPermuteKernel::validate(src, dst, perm); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPermute.h b/src/gpu/cl/operators/ClPermute.h index 3e87329f9b..6349358a18 100644 --- a/src/gpu/cl/operators/ClPermute.h +++ b/src/gpu/cl/operators/ClPermute.h @@ -44,7 +44,10 @@ public: * @param[in] dst The dst tensor info. Data types supported: Same as @p src * @param[in] perm Permutation vector */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const PermutationVector &perm); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const PermutationVector &perm); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPermute::configure() @@ -55,4 +58,4 @@ public: }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_PERMUTE_H */
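ClPermute's only descriptor is the PermutationVector; the destination shape can be derived with the same shape-calculator helper the softmax path uses further down. A minimal sketch, assuming the arm_compute namespace and an arbitrary 3-D permutation chosen purely for illustration:

    TensorInfo src(TensorShape(8U, 4U, 2U), 1, DataType::F32);
    const PermutationVector perm(2U, 0U, 1U); // illustrative reordering of the three dimensions

    const TensorShape permuted = misc::shape_calculator::compute_permutation_output_shape(src, perm);
    TensorInfo dst(src.clone()->set_tensor_shape(permuted));

    if (opencl::ClPermute::validate(&src, &dst, perm).error_code() == ErrorCode::OK)
    {
        opencl::ClPermute permute;
        permute.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, perm);
    }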
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_PERMUTE_H */ diff --git a/src/gpu/cl/operators/ClPool2d.cpp b/src/gpu/cl/operators/ClPool2d.cpp index 3da90b8ced..e4507dc1a1 100644 --- a/src/gpu/cl/operators/ClPool2d.cpp +++ b/src/gpu/cl/operators/ClPool2d.cpp @@ -25,16 +25,19 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool2dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices) +void ClPool2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info, indices); @@ -49,7 +52,10 @@ void ClPool2d::configure(const ClCompileContext &compile_context, ITensorInfo *s CLScheduler::get().tune_kernel_static(*_kernel); } -Status ClPool2d::validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices) +Status ClPool2d::validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices) { return kernels::ClPool2dKernel::validate(src, dst, info, indices); } diff --git a/src/gpu/cl/operators/ClPool2d.h b/src/gpu/cl/operators/ClPool2d.h index f353ba262e..9c2fd1c3f2 100644 --- a/src/gpu/cl/operators/ClPool2d.h +++ b/src/gpu/cl/operators/ClPool2d.h @@ -50,14 +50,21 @@ public: * @param[in] info Pooling layer parameters. * @param[out] indices (optional) The indices info of the maximal values. Data type supported: U32. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const PoolingLayerInfo &info, ITensorInfo *indices = nullptr); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const PoolingLayerInfo &info, + ITensorInfo *indices = nullptr); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *dst, const PoolingLayerInfo &info, const ITensorInfo *indices = nullptr); + static Status validate(const ITensorInfo *src, + const ITensorInfo *dst, + const PoolingLayerInfo &info, + const ITensorInfo *indices = nullptr); }; } // namespace opencl } // namespace arm_compute diff --git a/src/gpu/cl/operators/ClPool3d.cpp b/src/gpu/cl/operators/ClPool3d.cpp index 7dec6c5958..d230413659 100644 --- a/src/gpu/cl/operators/ClPool3d.cpp +++ b/src/gpu/cl/operators/ClPool3d.cpp @@ -25,16 +25,18 @@ #include "arm_compute/runtime/CL/CLScheduler.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClPool3dKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClPool3d::configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info) +void ClPool3d::configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); diff --git a/src/gpu/cl/operators/ClPool3d.h b/src/gpu/cl/operators/ClPool3d.h index 7d994fd194..9fd78bfd69 100644 --- a/src/gpu/cl/operators/ClPool3d.h +++ b/src/gpu/cl/operators/ClPool3d.h @@ -51,7 +51,10 @@ public: * @param[out] dst Destination tensor info. * @param[in] info 3d Pooling layer parameters. 
*/ - void configure(const ClCompileContext &compile_context, const ITensorInfo *src, ITensorInfo *dst, const Pooling3dLayerInfo &info); + void configure(const ClCompileContext &compile_context, + const ITensorInfo *src, + ITensorInfo *dst, + const Pooling3dLayerInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClPool3d::configure() diff --git a/src/gpu/cl/operators/ClQuantize.cpp b/src/gpu/cl/operators/ClQuantize.cpp index 47ae5cea47..8560b5553e 100644 --- a/src/gpu/cl/operators/ClQuantize.cpp +++ b/src/gpu/cl/operators/ClQuantize.cpp @@ -25,10 +25,10 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/gpu/cl/ClCompileContext.h" -#include "src/gpu/cl/kernels/ClQuantizeKernel.h" #include "src/common/utils/Log.h" +#include "src/gpu/cl/ClCompileContext.h" +#include "src/gpu/cl/kernels/ClQuantizeKernel.h" namespace arm_compute { diff --git a/src/gpu/cl/operators/ClReshape.cpp b/src/gpu/cl/operators/ClReshape.cpp index 560966f4fc..1dd5b760cb 100644 --- a/src/gpu/cl/operators/ClReshape.cpp +++ b/src/gpu/cl/operators/ClReshape.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClReshape.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClReshapeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClReshape::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClReshapeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.cpp b/src/gpu/cl/operators/ClScale.cpp index 0798b19ca0..184e2aa006 100644 --- a/src/gpu/cl/operators/ClScale.cpp +++ b/src/gpu/cl/operators/ClScale.cpp @@ -25,17 +25,20 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClScaleKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClScale::configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info) +void ClScale::configure(const CLCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *dst, + const ScaleKernelInfo &info) { ARM_COMPUTE_ERROR_ON_NULLPTR(src); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -61,4 +64,4 @@ void ClScale::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_kernel.get(), tensors); } } // namespace opencl -} // namespace arm_compute
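ClScale is driven entirely by the ScaleKernelInfo descriptor. A minimal sketch, assuming bilinear 2x upscaling with a replicated border; the two-argument construction leaving ScaleKernelInfo's remaining sampling options at their defaults is an assumption of this sketch:

    TensorInfo src(TensorShape(32U, 32U), 1, DataType::F32);
    TensorInfo dst(TensorShape(64U, 64U), 1, DataType::F32); // 2x upscale
    const ScaleKernelInfo info{InterpolationPolicy::BILINEAR, BorderMode::REPLICATE};

    if (opencl::ClScale::validate(&src, &dst, info).error_code() == ErrorCode::OK)
    {
        opencl::ClScale scale;
        scale.configure(CLKernelLibrary::get().get_compile_context(), &src, &dst, info);
    }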
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClScale.h b/src/gpu/cl/operators/ClScale.h index af97cf23e7..1427bb4fdc 100644 --- a/src/gpu/cl/operators/ClScale.h +++ b/src/gpu/cl/operators/ClScale.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SCALE_H #include "arm_compute/core/KernelDescriptors.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -49,7 +50,8 @@ public: * All but the lowest two dimensions must be the same size as in the input tensor, i.e. scaling is only performed within the XY-plane. * @param[in] info @ref ScaleKernelInfo descriptor to be used to configure */ - void configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); + void + configure(const CLCompileContext &compile_context, ITensorInfo *src, ITensorInfo *dst, const ScaleKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClScale::configure() diff --git a/src/gpu/cl/operators/ClSoftmax.cpp b/src/gpu/cl/operators/ClSoftmax.cpp index 03809553a3..2bec400597 100644 --- a/src/gpu/cl/operators/ClSoftmax.cpp +++ b/src/gpu/cl/operators/ClSoftmax.cpp @@ -22,7 +22,10 @@ * SOFTWARE. */ #include "src/gpu/cl/operators/ClSoftmax.h" + #include "arm_compute/core/utils/misc/ShapeCalculator.h" + +#include "src/common/utils/Log.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/core/helpers/SoftmaxHelpers.h" #include "src/gpu/cl/kernels/ClSoftmaxKernel.h" @@ -30,8 +33,6 @@ #include "src/gpu/cl/utils/ClAuxTensorHandler.h" #include "support/Cast.h" -#include "src/common/utils/Log.h" - using namespace arm_compute::experimental; namespace arm_compute @@ -52,7 +53,10 @@ ClSoftmax::ClSoftmax() { } -void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info) +void ClSoftmax::configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info) { ARM_COMPUTE_ERROR_THROW_ON(validate(src, dst, info)); ARM_COMPUTE_LOG_PARAMS(src, dst, info); @@ -64,14 +68,15 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor const ITensorInfo &tmp_input_info = _needs_permute ? _permuted_src_info : src; ITensorInfo &tmp_output_info = _needs_permute ? _permuted_dst_info : dst; - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_input->configure(compile_context, &src, &_permuted_src_info, perm_info); } - DataType tmp_data_type = is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? DataType::S32 : tmp_input_info.data_type(); - _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); + DataType tmp_data_type = + is_data_type_quantized_asymmetric(tmp_input_info.data_type()) ? 
DataType::S32 : tmp_input_info.data_type(); + _tmp_info = tmp_input_info.clone()->set_data_type(tmp_data_type); TensorShape max_sum_shape = tmp_input_info.tensor_shape(); _max_info = tmp_input_info.clone()->set_tensor_shape(max_sum_shape); @@ -83,33 +88,41 @@ void ClSoftmax::configure(const CLCompileContext &compile_context, const ITensor _max_shift_exp_sum_kernel->configure(compile_context, tmp_input_info, _max_info, _tmp_info, _sum_info, info); _norm_kernel->configure(compile_context, _tmp_info, _sum_info, tmp_output_info, info); - if(_needs_permute) + if (_needs_permute) { const auto perm_info = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); _permute_output->configure(compile_context, &_permuted_dst_info, &dst, perm_info); } - _aux_mem[InternalTensorIdx::SUM] = MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); - _aux_mem[InternalTensorIdx::TMP] = MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); - _aux_mem[InternalTensorIdx::MAX] = MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); - - _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), MemoryLifetime::Temporary, _permuted_src_info.total_size()); - _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), MemoryLifetime::Temporary, _permuted_dst_info.total_size()); + _aux_mem[InternalTensorIdx::SUM] = + MemoryInfo(offset_int_vec(InternalTensorIdx::SUM), MemoryLifetime::Temporary, _sum_info.total_size()); + _aux_mem[InternalTensorIdx::TMP] = + MemoryInfo(offset_int_vec(InternalTensorIdx::TMP), MemoryLifetime::Temporary, _tmp_info.total_size()); + _aux_mem[InternalTensorIdx::MAX] = + MemoryInfo(offset_int_vec(InternalTensorIdx::MAX), MemoryLifetime::Temporary, _max_info.total_size()); + + _aux_mem[InternalTensorIdx::PERMUTED_SRC] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), + MemoryLifetime::Temporary, _permuted_src_info.total_size()); + _aux_mem[InternalTensorIdx::PERMUTED_DST] = MemoryInfo(offset_int_vec(InternalTensorIdx::PERMUTED_DST), + MemoryLifetime::Temporary, _permuted_dst_info.total_size()); } Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info) { ARM_COMPUTE_RETURN_ERROR_ON_MSG(src.num_dimensions() > 4, "Only up to 4 dimensions are supported"); ARM_COMPUTE_UNUSED(info.beta); - ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || static_cast<int32_t>(src.num_dimensions()) <= info.axis); + ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) || + static_cast<int32_t>(src.num_dimensions()) <= info.axis); - const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); + const size_t actual_axis = static_cast<size_t>(wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()))); const bool needs_permute = actual_axis != 0; - if(needs_permute) + if (needs_permute) { - const PermutationVector permutation_vector = softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); - const TensorShape permuted_shape = misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); - TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); + const PermutationVector permutation_vector = + 
softmax_helpers::get_permutation_vector_from_softmax_axis(actual_axis); + const TensorShape permuted_shape = + misc::shape_calculator::compute_permutation_output_shape(src, permutation_vector); + TensorInfo input_permuted(src.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&src, &input_permuted, permutation_vector)); TensorInfo output_permuted(dst.clone()->set_tensor_shape(permuted_shape)); ARM_COMPUTE_RETURN_ON_ERROR(ClPermute::validate(&output_permuted, &dst, permutation_vector)); @@ -122,9 +135,14 @@ Status ClSoftmax::validate(const ITensorInfo &src, const ITensorInfo &dst, const TensorShape max_sum_shape = src.tensor_shape(); max_sum_shape.set(0, 1); TensorInfo tensor_info_max(src.clone()->set_tensor_shape(max_sum_shape).set_is_resizable(true)); - TensorInfo tensor_info_sum(src.clone()->set_tensor_shape(max_sum_shape).set_data_type(tmp_data_type).set_quantization_info(QuantizationInfo()).set_is_resizable(true)); - - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); + TensorInfo tensor_info_sum(src.clone() + ->set_tensor_shape(max_sum_shape) + .set_data_type(tmp_data_type) + .set_quantization_info(QuantizationInfo()) + .set_is_resizable(true)); + + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClLogits1DMaxShiftExpSumKernel::validate(src, tensor_info_max, tensor_info_tmp, tensor_info_sum)); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClLogits1DNormKernel::validate(tensor_info_tmp, tensor_info_sum, dst, info)); return Status{}; @@ -139,10 +157,12 @@ void ClSoftmax::run(ITensorPack &tensors) CLAuxTensorHandler tmp(offset_int_vec(InternalTensorIdx::TMP), _tmp_info, tensors, false); CLAuxTensorHandler max(offset_int_vec(InternalTensorIdx::MAX), _max_info, tensors, false); - CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, false); - CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, false); + CLAuxTensorHandler permuted_src(offset_int_vec(InternalTensorIdx::PERMUTED_SRC), _permuted_src_info, tensors, + false); + CLAuxTensorHandler permuted_dst(offset_int_vec(InternalTensorIdx::PERMUTED_DST), _permuted_dst_info, tensors, + false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, src); @@ -152,7 +172,7 @@ void ClSoftmax::run(ITensorPack &tensors) ITensorPack sum_pack; ITensorPack norm_pack; - if(_needs_permute) + if (_needs_permute) { sum_pack.add_const_tensor(TensorType::ACL_SRC, permuted_src.get()); norm_pack.add_tensor(TensorType::ACL_DST, permuted_dst.get()); @@ -172,7 +192,7 @@ void ClSoftmax::run(ITensorPack &tensors) CLScheduler::get().enqueue_op(*_max_shift_exp_sum_kernel.get(), sum_pack, false); CLScheduler::get().enqueue_op(*_norm_kernel.get(), norm_pack, false); - if(_needs_permute) + if (_needs_permute) { ITensorPack pack; pack.add_const_tensor(TensorType::ACL_SRC, permuted_dst.get()); @@ -186,4 +206,4 @@ experimental::MemoryRequirements ClSoftmax::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute
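To make the axis handling above concrete: validate() accepts any axis in [-num_dimensions, num_dimensions), wrap_around() folds negative values into that range, and any resolved axis other than 0 takes the permute path so the 1-D max/shift/exp-sum and norm kernels can run along dimension 0:

    // Worked example for a 4-D input (num_dimensions() == 4):
    //   info.axis = -1  ->  actual_axis = wrap_around(-1, 4) = 3  ->  needs_permute = true
    //   info.axis =  1  ->  actual_axis = 1                       ->  needs_permute = true
    //   info.axis =  0  ->  actual_axis = 0                       ->  needs_permute = false
    // When needs_permute is true the source is permuted so the softmax axis becomes
    // dimension 0, the kernels execute, and the result is permuted back into place.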
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClSoftmax.h b/src/gpu/cl/operators/ClSoftmax.h index 6c9af585d6..6c2aaaea80 100644 --- a/src/gpu/cl/operators/ClSoftmax.h +++ b/src/gpu/cl/operators/ClSoftmax.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SOFTMAX_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -52,7 +53,10 @@ public: * @param[out] dst Destination tensor info. Data types supported: same as @p src * @param[in] info Contains information consumed by kernels for softmax described in @ref SoftmaxKernelInfo. */ - void configure(const CLCompileContext &compile_context, const ITensorInfo &src, ITensorInfo &dst, const SoftmaxKernelInfo &info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo &src, + ITensorInfo &dst, + const SoftmaxKernelInfo &info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClSoftmax::configure() @@ -61,7 +65,7 @@ public: */ static Status validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info); // Inherited methods overridden: - void run(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: @@ -79,7 +83,7 @@ private: std::unique_ptr<ClPermute> _permute_output; std::unique_ptr<kernels::ClLogits1DMaxShiftExpSumKernel> _max_shift_exp_sum_kernel; std::unique_ptr<kernels::ClLogits1DNormKernel> _norm_kernel; - bool _needs_permute{ false }; + bool _needs_permute{false}; TensorInfo _max_info; TensorInfo _sum_info; @@ -90,6 +94,6 @@ private: experimental::MemoryRequirements _aux_mem{}; }; -} // opencl -} // arm_compute -#endif /* ARM_COMPUTE_CL_SOFTMAX_H */
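The _aux_mem entries populated in configure() surface to callers through the workspace() override declared above. A hypothetical caller sketch; allocate_aux_tensor is a made-up helper, and src, dst and info are assumed to exist:

    opencl::ClSoftmax softmax;
    softmax.configure(CLKernelLibrary::get().get_compile_context(), src, dst, info);

    for (const auto &req : softmax.workspace())
    {
        if (req.size > 0)
        {
            // req.slot matches the offset_int_vec(...) ids that run() looks up in the pack;
            // req.lifetime separates per-run temporaries from prepare-once data.
            allocate_aux_tensor(req.slot, req.size); // hypothetical helper
        }
    }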
\ No newline at end of file +} // namespace opencl +} // namespace arm_compute +#endif /* ARM_COMPUTE_CL_SOFTMAX_H */ diff --git a/src/gpu/cl/operators/ClSub.cpp b/src/gpu/cl/operators/ClSub.cpp index 53be04a70f..5c6d0c3184 100644 --- a/src/gpu/cl/operators/ClSub.cpp +++ b/src/gpu/cl/operators/ClSub.cpp @@ -23,17 +23,20 @@ */ #include "src/gpu/cl/operators/ClSub.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClElementwiseKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl { -void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +void ClSub::configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { ARM_COMPUTE_LOG_PARAMS(src1, src2, dst, policy, act_info); auto k = std::make_unique<kernels::ClSaturatedArithmeticKernel>(); @@ -41,8 +44,11 @@ void ClSub::configure(const ClCompileContext &compile_context, ITensorInfo *src1 _kernel = std::move(k); } -Status ClSub::validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, - ConvertPolicy policy, const ActivationLayerInfo &act_info) +Status ClSub::validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, + const ActivationLayerInfo &act_info) { return kernels::ClSaturatedArithmeticKernel::validate(ArithmeticOperation::SUB, src1, src2, dst, policy, act_info); } diff --git a/src/gpu/cl/operators/ClSub.h b/src/gpu/cl/operators/ClSub.h index 7eac437143..6a97275b86 100644 --- a/src/gpu/cl/operators/ClSub.h +++ b/src/gpu/cl/operators/ClSub.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_SUB_H #include "arm_compute/function_info/ActivationLayerInfo.h" + #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -65,7 +66,11 @@ public: * @param[in] policy Policy to use to handle overflow. * @param[in] act_info (Optional) Activation layer information in case of a fused activation. 
*/ - void configure(const ClCompileContext &compile_context, ITensorInfo *src1, ITensorInfo *src2, ITensorInfo *dst, ConvertPolicy policy, + void configure(const ClCompileContext &compile_context, + ITensorInfo *src1, + ITensorInfo *src2, + ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); /** Static function to check if given info will lead to a valid configuration * @@ -73,7 +78,10 @@ public: * * @return a status */ - static Status validate(const ITensorInfo *src1, const ITensorInfo *src2, const ITensorInfo *dst, ConvertPolicy policy, + static Status validate(const ITensorInfo *src1, + const ITensorInfo *src2, + const ITensorInfo *dst, + ConvertPolicy policy, const ActivationLayerInfo &act_info = ActivationLayerInfo()); }; } // namespace opencl diff --git a/src/gpu/cl/operators/ClTranspose.cpp b/src/gpu/cl/operators/ClTranspose.cpp index 26feffe2b9..28da0d640a 100644 --- a/src/gpu/cl/operators/ClTranspose.cpp +++ b/src/gpu/cl/operators/ClTranspose.cpp @@ -23,11 +23,10 @@ */ #include "src/gpu/cl/operators/ClTranspose.h" +#include "src/common/utils/Log.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/kernels/ClTransposeKernel.h" -#include "src/common/utils/Log.h" - namespace arm_compute { namespace opencl @@ -45,4 +44,4 @@ Status ClTranspose::validate(const ITensorInfo *src, const ITensorInfo *dst) return kernels::ClTransposeKernel::validate(src, dst); } } // namespace opencl -} // namespace arm_compute
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClTransposedConvolution.cpp b/src/gpu/cl/operators/ClTransposedConvolution.cpp index 90dbe7f291..cec438faeb 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.cpp +++ b/src/gpu/cl/operators/ClTransposedConvolution.cpp @@ -25,6 +25,7 @@ #include "arm_compute/core/Error.h" #include "arm_compute/runtime/CL/CLScheduler.h" + #include "src/common/utils/Log.h" #include "src/gpu/cl/kernels/ClTransposedConvolutionKernel.h" @@ -32,8 +33,12 @@ namespace arm_compute { namespace opencl { -void ClTransposedConvolution::configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info) +void ClTransposedConvolution::configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info) { ARM_COMPUTE_ERROR_ON_NULLPTR(input); ARM_COMPUTE_LOG_PARAMS(input, weights, biases, output, deconv_info); @@ -43,10 +48,14 @@ void ClTransposedConvolution::configure(const CLCompileContext &compile_context, _transposed_conv_kernel = std::move(kernel_object); } -Status ClTransposedConvolution::validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info) +Status ClTransposedConvolution::validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info) { - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClTransposedConvolutionKernel::validate(input, weights, biases, output, deconv_info)); return Status{}; } diff --git a/src/gpu/cl/operators/ClTransposedConvolution.h b/src/gpu/cl/operators/ClTransposedConvolution.h index 58ebc689ed..660c4f85c1 100644 --- a/src/gpu/cl/operators/ClTransposedConvolution.h +++ b/src/gpu/cl/operators/ClTransposedConvolution.h @@ -68,23 +68,30 @@ public: * @param[in] deconv_info Contains padding and stride information described in @ref PadStrideInfo. * */ - void configure(const CLCompileContext &compile_context, const ITensorInfo *input, const ITensorInfo *weights, - const ITensorInfo *biases, ITensorInfo *output, const PadStrideInfo &deconv_info); + void configure(const CLCompileContext &compile_context, + const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + ITensorInfo *output, + const PadStrideInfo &deconv_info); /** Static function to check if given info will lead to a valid configuration * * Similar to ClTransposedConvolution::configure() * * @return a status */ - static Status validate(const ITensorInfo *input, const ITensorInfo *weights, const ITensorInfo *biases, - const ITensorInfo *output, const PadStrideInfo &deconv_info); + static Status validate(const ITensorInfo *input, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *output, + const PadStrideInfo &deconv_info); // Inherited method overridden void run(ITensorPack &tensors) override; private: - std::unique_ptr<IClKernel> _transposed_conv_kernel{ nullptr }; + std::unique_ptr<IClKernel> _transposed_conv_kernel{nullptr}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */
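For the transposed-convolution descriptor, the spatial output follows the usual relation out = stride * (in - 1) + kernel - 2 * pad. A minimal sketch with assumed NHWC shapes (dimension 0 of a TensorShape is the channel in this layout), a 3x3 kernel, stride 2 and padding 1, so 8 -> 2*(8-1)+3-2 = 15:

    TensorInfo input(TensorShape(16U, 8U, 8U, 1U), 1, DataType::F32);   // C, W, H, N
    input.set_data_layout(DataLayout::NHWC);
    TensorInfo weights(TensorShape(16U, 3U, 3U, 4U), 1, DataType::F32); // C, W, H, num_kernels
    weights.set_data_layout(DataLayout::NHWC);
    TensorInfo output(TensorShape(4U, 15U, 15U, 1U), 1, DataType::F32); // out = 2*(8-1)+3-2 = 15
    output.set_data_layout(DataLayout::NHWC);

    const PadStrideInfo deconv_info(2, 2, 1, 1); // stride x/y, pad x/y

    const Status s = opencl::ClTransposedConvolution::validate(&input, &weights,
                                                               nullptr /* no bias */, &output, deconv_info);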
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_TRANSPOSED_CONVOLUTION_H */ diff --git a/src/gpu/cl/operators/ClWinogradConv2d.cpp b/src/gpu/cl/operators/ClWinogradConv2d.cpp index b4163a5986..8ec96b247e 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.cpp +++ b/src/gpu/cl/operators/ClWinogradConv2d.cpp @@ -24,20 +24,19 @@ #include "src/gpu/cl/operators/ClWinogradConv2d.h" #include "arm_compute/core/CL/ICLTensor.h" -#include "arm_compute/core/Utils.h" -#include "arm_compute/core/Validate.h" #include "arm_compute/core/experimental/Types.h" +#include "arm_compute/core/Utils.h" #include "arm_compute/core/utils/misc/ShapeCalculator.h" +#include "arm_compute/core/Validate.h" #include "arm_compute/runtime/CL/CLScheduler.h" -#include "src/core/CL/kernels/CLFillBorderKernel.h" + +#include "src/common/utils/Log.h" #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/core/helpers/MemoryHelpers.h" #include "src/gpu/cl/kernels/ClWinogradFilterTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradInputTransformKernel.h" #include "src/gpu/cl/kernels/ClWinogradOutputTransformKernel.h" #include "src/gpu/cl/utils/ClAuxTensorHandler.h" - -#include "src/common/utils/Log.h" #include "support/Cast.h" using namespace arm_compute::experimental; @@ -55,15 +54,16 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, const unsigned int kernel_max_dim = std::max(kernel_dims.width, kernel_dims.height); // Check if the input spatial dimensions are smaller than 4 - const bool is_input_lt4_nchw = (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); + const bool is_input_lt4_nchw = + (input_dims.width <= 4 && input_dims.height <= 4) && (data_layout == DataLayout::NCHW); - if(kernel_max_dim == 3U) + if (kernel_max_dim == 3U) { - if(kernel_dims == Size2D(3U, 3U)) + if (kernel_dims == Size2D(3U, 3U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 2U) : Size2D(4U, 4U); } - else if(kernel_dims == Size2D(3U, 1U)) + else if (kernel_dims == Size2D(3U, 1U)) { output_tile = is_input_lt4_nchw ? Size2D(2U, 1U) : Size2D(4U, 1U); } @@ -72,15 +72,13 @@ Size2D winograd_output_tile(const Size2D &input_dims, const Size2D &kernel_dims, output_tile = is_input_lt4_nchw ? Size2D(1U, 2U) : Size2D(1U, 4U); } } - else if(kernel_max_dim == 5U) + else if (kernel_max_dim == 5U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, - kernel_dims.height == 1 ? 1U : 4U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 4U, kernel_dims.height == 1 ? 1U : 4U); } - else if(kernel_max_dim == 7U) + else if (kernel_max_dim == 7U) { - output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, - kernel_dims.height == 1 ? 1U : 2U); + output_tile = Size2D(kernel_dims.width == 1 ? 1U : 2U, kernel_dims.height == 1 ? 
1U : 2U); } return output_tile; @@ -91,11 +89,9 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz // Check if we want to configure a Winograd configuration which requires fast math using WinogradConfiguration = std::pair<std::pair<int, int>, std::pair<int, int>>; - std::vector<WinogradConfiguration> fast_math_winograd = - { + std::vector<WinogradConfiguration> fast_math_winograd = { WinogradConfiguration(std::pair<int, int>(4, 4), std::pair<int, int>(5, 5)), - WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7)) - }; + WinogradConfiguration(std::pair<int, int>(2, 2), std::pair<int, int>(7, 7))}; auto p = std::make_pair(std::pair<int, int>(output_tile.width, output_tile.height), std::pair<int, int>(kernel_size.width, kernel_size.height)); @@ -103,8 +99,13 @@ bool check_support_fast_math(const Size2D &output_tile, const Size2D &kernel_siz return std::find(fast_math_winograd.begin(), fast_math_winograd.end(), p) != fast_math_winograd.end(); } -Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status validate_arguments(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { // Get indeces for the width and height const size_t idx_width = get_data_layout_dimension_index(src->data_layout(), DataLayoutDimension::WIDTH); @@ -115,41 +116,49 @@ Status validate_arguments(const ITensorInfo *src, const ITensorInfo *weights, co const Size2D kernel_size = Size2D(weights->tensor_shape()[idx_width], weights->tensor_shape()[idx_height]); const Size2D output_tile = winograd_output_tile(input_dims, kernel_size, src->data_layout()); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), "Winograd only supports padding up to half kernel size"); - ARM_COMPUTE_RETURN_ERROR_ON_MSG(((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_left() > (kernel_size.x() / 2u)) || (conv_info.pad_right() > (kernel_size.x() / 2u))), + "Winograd only supports padding up to half kernel size"); + ARM_COMPUTE_RETURN_ERROR_ON_MSG( + ((conv_info.pad_top() > (kernel_size.y() / 2u)) || (conv_info.pad_bottom() > (kernel_size.y() / 2u))), + "Winograd only supports padding up to half kernel size"); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN( + src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. 
+ ARM_COMPUTE_RETURN_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); // Validate input transform - const TensorShape input0_shape = misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); - const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); + const TensorShape input0_shape = + misc::shape_calculator::compute_winograd_input_transform_shape(*src, winograd_info); + const TensorInfo input0 = src->clone()->set_tensor_shape(input0_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradInputTransformKernel::validate(src, &input0, winograd_info)); // Validate filter transform - const TensorShape input1_shape = misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); - const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); + const TensorShape input1_shape = + misc::shape_calculator::compute_winograd_filter_transform_shape(*weights, winograd_info); + const TensorInfo input1 = weights->clone()->set_tensor_shape(input1_shape); ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradFilterTransformKernel::validate(weights, &input1, winograd_info)); // Validate batched matrix multiply TensorShape batched_mm_output_shape = input0.tensor_shape(); batched_mm_output_shape[0] = input1.tensor_shape()[0]; const TensorInfo batched_mm_output = input0.clone()->set_tensor_shape(batched_mm_output_shape); - ARM_COMPUTE_RETURN_ON_ERROR(ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, - GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); + ARM_COMPUTE_RETURN_ON_ERROR( + ClGemm::validate(&input0, &input1, nullptr, &batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16)))); // Configure output transform - ARM_COMPUTE_RETURN_ON_ERROR(kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); + ARM_COMPUTE_RETURN_ON_ERROR( + kernels::ClWinogradOutputTransformKernel::validate(&batched_mm_output, biases, dst, winograd_info, act_info)); return Status{}; } @@ -171,8 +180,14 @@ ClWinogradConv2d::ClWinogradConv2d() ClWinogradConv2d::~ClWinogradConv2d() = default; -void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, - const PadStrideInfo &conv_info, const ActivationLayerInfo &act_info, bool enable_fast_math) +void ClWinogradConv2d::configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); ARM_COMPUTE_LOG_PARAMS(src, weights, biases, dst, conv_info, act_info, enable_fast_math); @@ -187,50 +202,53 @@ void ClWinogradConv2d::configure(const ClCompileContext &compile_context, ITenso const Size2D output_tile = 
winograd_output_tile(input_dims, kernel_size, src->data_layout()); // Check if the Winograd configuration requires fast math - if(!enable_fast_math) + if (!enable_fast_math) { - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, DataType::F32); //disable winograd for fp16 if fast math is false. - ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), "This Winograd configuration requires enable_fast_math=true"); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(src, 1, + DataType::F32); //disable winograd for fp16 if fast math is false. + ARM_COMPUTE_ERROR_ON_MSG(check_support_fast_math(output_tile, kernel_size), + "This Winograd configuration requires enable_fast_math=true"); } - const WinogradInfo winograd_info = WinogradInfo(output_tile, - kernel_size, - input_dims, - conv_info, - src->data_layout()); + const WinogradInfo winograd_info = + WinogradInfo(output_tile, kernel_size, input_dims, conv_info, src->data_layout()); _is_prepared = false; // Configure input transform _input_transform->configure(compile_context, src, &_input0, winograd_info); - _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, PixelValue()); + _border_handler.configure(compile_context, src, _input_transform->border_size(), BorderMode::CONSTANT, + PixelValue()); // Configure filter transform _filter_transform->configure(compile_context, weights, &_input1, winograd_info); // Configure batched matrix multiply - _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, - false, false, - GEMMLowpOutputStageInfo(), - (src->data_type() == DataType::F16))); + _batched_mm.configure(compile_context, &_input0, &_input1, nullptr, &_batched_mm_output, 1.0f, 0.0f, + GEMMInfo(false, false, true /* Reshape weights only for the first run*/, 0, false, false, + GEMMLowpOutputStageInfo(), (src->data_type() == DataType::F16))); // Configure output transform _output_transform->set_target(CLScheduler::get().target()); _output_transform->configure(compile_context, &_batched_mm_output, biases, dst, winograd_info, act_info); - _aux_mem = _batched_mm.workspace(); - const MemoryLifetime wino_wei_lifetm = std::any_of(std::begin(_aux_mem), std::end(_aux_mem), [](const auto & r) - { - return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); - }) ? - MemoryLifetime::Prepare : - MemoryLifetime::Persistent; + _aux_mem = _batched_mm.workspace(); + const MemoryLifetime wino_wei_lifetm = + std::any_of(std::begin(_aux_mem), std::end(_aux_mem), + [](const auto &r) { return (r.lifetime == MemoryLifetime::Persistent) && (r.size > 0); }) + ? 
MemoryLifetime::Prepare + : MemoryLifetime::Persistent; _aux_mem.push_back(MemoryInfo(offset_int_vec(2), MemoryLifetime::Temporary, _input0.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(3), wino_wei_lifetm, _input1.total_size())); _aux_mem.push_back(MemoryInfo(offset_int_vec(4), MemoryLifetime::Temporary, _batched_mm_output.total_size())); } -Status ClWinogradConv2d::validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info, bool enable_fast_math) +Status ClWinogradConv2d::validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info, + bool enable_fast_math) { ARM_COMPUTE_RETURN_ON_ERROR(validate_arguments(src, weights, biases, dst, conv_info, act_info, enable_fast_math)); return Status{}; @@ -251,10 +269,9 @@ void ClWinogradConv2d::run(ITensorPack &tensors) prepare(tensors); // Run input transform - ITensorPack pack_it - { - { TensorType::ACL_SRC, src }, - { TensorType::ACL_DST, input0.get() }, + ITensorPack pack_it{ + {TensorType::ACL_SRC, src}, + {TensorType::ACL_DST, input0.get()}, }; CLScheduler::get().enqueue_op(_border_handler, pack_it, false); CLScheduler::get().enqueue_op(*_input_transform, pack_it, false); @@ -263,31 +280,31 @@ void ClWinogradConv2d::run(ITensorPack &tensors) ITensorPack pack_mm = tensors; pack_mm.add_const_tensor(TensorType::ACL_SRC_0, input0.get()); pack_mm.add_tensor(TensorType::ACL_DST, batched_mm_output.get()); - is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); + is_gemm_reshaped ? pack_mm.remove_tensor(TensorType::ACL_SRC_1) + : pack_mm.add_const_tensor(TensorType::ACL_SRC_1, input1.get()); _batched_mm.run(pack_mm); // Run output transform - ITensorPack pack_ot - { - { TensorType::ACL_SRC_0, batched_mm_output.get() }, - { TensorType::ACL_SRC_1, biases }, - { TensorType::ACL_DST, dst }, + ITensorPack pack_ot{ + {TensorType::ACL_SRC_0, batched_mm_output.get()}, + {TensorType::ACL_SRC_1, biases}, + {TensorType::ACL_DST, dst}, }; CLScheduler::get().enqueue_op(*_output_transform, pack_ot); } void ClWinogradConv2d::prepare(ITensorPack &tensors) { - if(!_is_prepared) + if (!_is_prepared) { - auto weights = utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); + auto weights = + utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC_1)); ICLTensor *in1_aux = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(offset_int_vec(3))); CLAuxTensorHandler input1(_input1, *in1_aux); - ITensorPack pack_ft - { - { TensorType::ACL_SRC, weights }, - { TensorType::ACL_DST, input1.get() }, + ITensorPack pack_ft{ + {TensorType::ACL_SRC, weights}, + {TensorType::ACL_DST, input1.get()}, }; // Run filter transform and mark original weights as unused CLScheduler::get().enqueue_op(*_filter_transform, pack_ft, false); @@ -308,4 +325,4 @@ experimental::MemoryRequirements ClWinogradConv2d::workspace() const return _aux_mem; } } // namespace opencl -} // namespace arm_compute
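Tying the pieces above together: winograd_output_tile() maps a 5x5 kernel to a 4x4 output tile, and (4,4)/(5,5) is one of the two configurations check_support_fast_math() flags, so validate() rejects it unless the caller opts into fast math. A sketch under assumed NHWC shapes:

    TensorInfo src(TensorShape(64U, 56U, 56U, 1U), 1, DataType::F32); // C, W, H, N
    src.set_data_layout(DataLayout::NHWC);
    TensorInfo weights(TensorShape(64U, 5U, 5U, 32U), 1, DataType::F32);
    weights.set_data_layout(DataLayout::NHWC);
    TensorInfo dst(TensorShape(32U, 56U, 56U, 1U), 1, DataType::F32);
    dst.set_data_layout(DataLayout::NHWC);
    const PadStrideInfo conv_info(1, 1, 2, 2); // stride 1, 'same' padding for a 5x5 kernel

    // Rejected: the 5x5 kernel selects a 4x4 tile, which requires fast math.
    Status s = opencl::ClWinogradConv2d::validate(&src, &weights, nullptr, &dst, conv_info,
                                                  ActivationLayerInfo(), /* enable_fast_math */ false);
    // Accepted once the caller opts in, possibly trading some accuracy.
    s = opencl::ClWinogradConv2d::validate(&src, &weights, nullptr, &dst, conv_info,
                                           ActivationLayerInfo(), /* enable_fast_math */ true);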
\ No newline at end of file +} // namespace arm_compute diff --git a/src/gpu/cl/operators/ClWinogradConv2d.h b/src/gpu/cl/operators/ClWinogradConv2d.h index eb2f7a72b2..54ec1a1737 100644 --- a/src/gpu/cl/operators/ClWinogradConv2d.h +++ b/src/gpu/cl/operators/ClWinogradConv2d.h @@ -25,6 +25,7 @@ #define ARM_COMPUTE_CL_WINOGRADCONV2D_H #include "arm_compute/runtime/CL/CLTensor.h" + #include "src/core/CL/kernels/CLFillBorderKernel.h" #include "src/gpu/cl/ClCompileContext.h" #include "src/gpu/cl/IClOperator.h" @@ -41,7 +42,7 @@ namespace kernels class ClWinogradInputTransformKernel; class ClWinogradFilterTransformKernel; class ClWinogradOutputTransformKernel; -} // kernels +} // namespace kernels /** Basic function to execute Winograd-based convolution on OpenCL. This function calls the following OpenCL functions/kernels: * * -# @ref kernels::ClWinogradInputTransformKernel @@ -93,20 +94,31 @@ public: * @param[in] enable_fast_math (Optional) Enable fast math computation. In case this flag were set, the function could dispatch the fastest implementation * available which may introduce a drop of accuracy as well. Default is false */ - void configure(const ClCompileContext &compile_context, ITensorInfo *src, ITensorInfo *weights, ITensorInfo *biases, ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + void configure(const ClCompileContext &compile_context, + ITensorInfo *src, + ITensorInfo *weights, + ITensorInfo *biases, + ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); /** Static function to check if given info will lead to a valid configuration * * Similar to ClWinogradConv2d::configure() * * @return a status */ - static Status validate(const ITensorInfo *src, const ITensorInfo *weights, const ITensorInfo *biases, const ITensorInfo *dst, const PadStrideInfo &conv_info, - const ActivationLayerInfo &act_info = ActivationLayerInfo(), bool enable_fast_math = false); + static Status validate(const ITensorInfo *src, + const ITensorInfo *weights, + const ITensorInfo *biases, + const ITensorInfo *dst, + const PadStrideInfo &conv_info, + const ActivationLayerInfo &act_info = ActivationLayerInfo(), + bool enable_fast_math = false); // Inherited method overridden - void run(ITensorPack &tensors) override; - void prepare(ITensorPack &tensors) override; + void run(ITensorPack &tensors) override; + void prepare(ITensorPack &tensors) override; experimental::MemoryRequirements workspace() const override; private: diff --git a/src/gpu/cl/utils/ClAuxTensorHandler.h b/src/gpu/cl/utils/ClAuxTensorHandler.h index af383489a1..81dc3baef4 100644 --- a/src/gpu/cl/utils/ClAuxTensorHandler.h +++ b/src/gpu/cl/utils/ClAuxTensorHandler.h @@ -39,25 +39,26 @@ namespace opencl class CLAuxTensorHandler { public: - CLAuxTensorHandler(int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) + CLAuxTensorHandler( + int slot_id, TensorInfo &info, ITensorPack &pack, bool pack_inject = false, bool bypass_alloc = false) : _tensor() { - if(info.total_size() == 0) + if (info.total_size() == 0) { return; } _tensor.allocator()->soft_init(info); ICLTensor *packed_tensor = utils::cast::polymorphic_downcast<ICLTensor *>(pack.get_tensor(slot_id)); - if((packed_tensor == nullptr) || (info.total_size() > packed_tensor->info()->total_size())) + if ((packed_tensor == nullptr) || (info.total_size() > 
packed_tensor->info()->total_size())) { - if(!bypass_alloc) + if (!bypass_alloc) { _tensor.allocator()->allocate(); ARM_COMPUTE_LOG_INFO_WITH_FUNCNAME_ACL("Allocating auxiliary tensor"); } - if(pack_inject) + if (pack_inject) { pack.add_tensor(slot_id, &_tensor); _injected_tensor_pack = &pack; @@ -70,22 +71,21 @@ public: } } - CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) - : _tensor() + CLAuxTensorHandler(TensorInfo &info, ICLTensor &tensor) : _tensor() { _tensor.allocator()->soft_init(info); - if(info.total_size() <= tensor.info()->total_size()) + if (info.total_size() <= tensor.info()->total_size()) { _tensor.allocator()->import_memory(tensor.cl_buffer()); } } - CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; + CLAuxTensorHandler(const CLAuxTensorHandler &) = delete; CLAuxTensorHandler &operator=(const CLAuxTensorHandler) = delete; ~CLAuxTensorHandler() { - if(_injected_tensor_pack) + if (_injected_tensor_pack) { _injected_tensor_pack->remove_tensor(_injected_slot_id); } @@ -103,9 +103,9 @@ public: private: CLTensor _tensor{}; - ITensorPack *_injected_tensor_pack{ nullptr }; - int _injected_slot_id{ TensorType::ACL_UNKNOWN }; + ITensorPack *_injected_tensor_pack{nullptr}; + int _injected_slot_id{TensorType::ACL_UNKNOWN}; }; } // namespace opencl } // namespace arm_compute -#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
\ No newline at end of file +#endif /* ARM_COMPUTE_CL_UTILS_CL_AUX_TENSOR_HANDLER_H */
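Finally, the constructor above gives CLAuxTensorHandler two paths: import the buffer already present in the pack at slot_id when it is large enough, or fall back to allocating a private CLTensor (skipped when bypass_alloc is set). A minimal sketch, assuming a workspace slot id of offset_int_vec(0):

    TensorInfo tmp_info(TensorShape(128U), 1, DataType::F32);
    ITensorPack tensors; // may or may not already carry a tensor for this slot

    // Imports the packed tensor's cl_buffer() when one is present and large enough;
    // otherwise a private backing tensor is allocated.
    CLAuxTensorHandler tmp(offset_int_vec(0), tmp_info, tensors);
    ICLTensor *workspace = tmp.get(); // valid for the handler's lifetime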