/*
 * Copyright (c) 2017-2021, 2023 Arm Limited.
 *
 * SPDX-License-Identifier: MIT
 *
 * Permission is hereby granted, free of charge, to any person obtaining a copy
 * of this software and associated documentation files (the "Software"), to
 * deal in the Software without restriction, including without limitation the
 * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
 * sell copies of the Software, and to permit persons to whom the Software is
 * furnished to do so, subject to the following conditions:
 *
 * The above copyright notice and this permission notice shall be included in all
 * copies or substantial portions of the Software.
 *
 * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
 * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
 * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
 * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
 * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
 * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
 * SOFTWARE.
 */
#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"

#include "arm_compute/core/CL/CLCompileContext.h"
#include "arm_compute/core/CL/CLHelpers.h"
#include "arm_compute/core/CL/ICLTensor.h"
#include "arm_compute/core/CL/OpenCL.h"
#include "arm_compute/core/CoreTypes.h"
#include "arm_compute/core/Dimensions.h"
#include "arm_compute/core/Error.h"
#include "arm_compute/core/Helpers.h"
#include "arm_compute/core/ITensorInfo.h"
#include "arm_compute/core/ITensorPack.h"
#include "arm_compute/core/KernelDescriptors.h"
#include "arm_compute/core/Steps.h"
#include "arm_compute/core/TensorShape.h"
#include "arm_compute/core/Utils.h"
#include "arm_compute/core/utils/DataTypeUtils.h"
#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
#include "arm_compute/core/utils/StringUtils.h"
#include "arm_compute/core/Validate.h"
#include "arm_compute/core/Window.h"

#include "src/core/helpers/WindowHelpers.h"
#include "support/Cast.h"
#include "support/StringSupport.h"

#include <string>

namespace arm_compute
{
namespace opencl
{
namespace kernels
{

ClSoftmaxKernel::ClSoftmaxKernel()
{
}

Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(src, dst, info);

    ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);

    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
        &src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
    ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
    ARM_COMPUTE_RETURN_ERROR_ON(info.axis < static_cast<int32_t>(-src.num_dimensions()) ||
                                static_cast<int32_t>(src.num_dimensions()) <= info.axis);

    if (is_data_type_quantized_asymmetric(src.data_type()))
    {
        ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);

        ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
                                    get_softmax_output_quantization_info(src.data_type(), info.is_log));
    }

    return Status{};
}

void ClSoftmaxKernel::configure(const CLCompileContext  &compile_context,
                                const ITensorInfo       &src,
                                ITensorInfo             &dst,
                                const SoftmaxKernelInfo &info)
{
    ARM_COMPUTE_UNUSED(compile_context, src, dst, info);

    const auto &dst_shape = dst.tensor_shape();

    const auto data_type    = src.data_type();
    const auto element_size = src.element_size();

    const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
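    // For quantized inputs the intermediate computation type is F32; the quantization parameters
    // gathered below are baked into the program as compile-time constants when IS_QUANTIZED is defined.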
    const auto src_qinfo = src.quantization_info().uniform();
    const auto dst_qinfo = dst.quantization_info().uniform();

    const auto axis   = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
    const auto length = dst_shape[axis];

    const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;

    const auto vec_size          = adjust_vec_size(16 / element_size, dst_shape[0]);
    const auto vec_size_leftover = dst_shape[0] % vec_size;

    std::string    kernel_name("softmax");
    CLBuildOptions build_opts;

    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
    build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
    build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
    build_opts.add_option_if(info.is_log, "-DIS_LOG");
    build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));

    build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
    build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
    build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
    build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
    build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));

    if (axis == 0)
    {
        kernel_name += "_x";
        build_opts.add_option("-DSOFTMAX_X");

        if (is_quantized)
        {
            _tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
        }
    }
    else
    {
        kernel_name += "_non_x";
        build_opts.add_option("-DSOFTMAX_NON_X");

        TensorShape tmp_shape;

        tmp_shape.set(0, length * vec_size, false);
        tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);

        for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
        {
            tmp_shape.set(i, dst_shape[i - 1], false);
        }

        for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
        {
            tmp_shape.set(i, dst_shape[i], false);
        }

        _tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
    }

    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());

    // Configure kernel window and kernel arguments.
    Window win = calculate_max_window(src, Steps(vec_size));

    bool has_collapsed = true;

    win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
    win = win.collapse_if_possible(win, 2, has_collapsed);
    ARM_COMPUTE_ERROR_ON(!has_collapsed);

    ICLKernel::configure_internal(win);

    _axis = axis;

    _config_id = "softmax_" + lower_string(string_from_data_type(data_type));
    _config_id += "_" + std::to_string(axis);
    _config_id += "_" + std::to_string(length);
}

void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
{
    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);

    const auto src =
        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
    auto dst = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));

    ICLTensor *tmp = (_tmp_info.total_size() > 0)
                         ? utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0))
                         : nullptr;
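    // The stride and offset arguments are set once, on the first run, while the cl_buffer
    // arguments (0, 5 and 10) are refreshed on every execution.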
    if (!_prepared)
    {
        _prepared = true;

        const auto *src_info = src->info();
        const auto *dst_info = dst->info();

        auto src_strides = src_info->strides_in_bytes();
        auto dst_strides = dst_info->strides_in_bytes();

        const auto src_stride_axis = src_strides[_axis];
        const auto dst_stride_axis = dst_strides[_axis];

        // This axis has been removed from the execution window, hence we remove it from the list of
        // strides provided to the kernel.
        // In case axis > 0, src/dst_stride_axis will be provided in a dedicated argument independent
        // of the global ID.
        src_strides.remove(_axis);
        dst_strides.remove(_axis);

        // Argument 0: src_ptr.
        _kernel.setArg(1, src_strides[0]);
        _kernel.setArg(2, src_strides[1]);
        _kernel.setArg(3, src_strides[2]);
        _kernel.setArg(4, src_info->offset_first_element_in_bytes());

        // Argument 5: dst_ptr.
        _kernel.setArg(6, dst_strides[0]);
        _kernel.setArg(7, dst_strides[1]);
        _kernel.setArg(8, dst_strides[2]);
        _kernel.setArg(9, dst_info->offset_first_element_in_bytes());

        if (tmp != nullptr)
        {
            const auto *tmp_info    = tmp->info();
            const auto &tmp_strides = tmp_info->strides_in_bytes();

            // Argument 10: tmp_ptr.
            _kernel.setArg(11, tmp_strides[1]);
            _kernel.setArg(12, tmp_strides[2]);
            _kernel.setArg(13, tmp_strides[3]);
            _kernel.setArg(14, 0);
        }

        if (_axis > 0)
        {
            _kernel.setArg(15, src_stride_axis);
            _kernel.setArg(16, dst_stride_axis);
        }
    }

    _kernel.setArg(0, src->cl_buffer());
    _kernel.setArg(5, dst->cl_buffer());

    if (tmp != nullptr)
    {
        _kernel.setArg(10, tmp->cl_buffer());
    }

    enqueue(queue, *this, window, lws_hint());
}

const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
{
    return _tmp_info;
}

} // namespace kernels
} // namespace opencl
} // namespace arm_compute