aboutsummaryrefslogtreecommitdiff
path: root/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
diff options
context:
space:
mode:
Diffstat (limited to 'src/gpu/cl/kernels/ClSoftmaxKernel.cpp')
-rw-r--r--src/gpu/cl/kernels/ClSoftmaxKernel.cpp263
1 files changed, 263 insertions, 0 deletions
diff --git a/src/gpu/cl/kernels/ClSoftmaxKernel.cpp b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
new file mode 100644
index 0000000000..796345a923
--- /dev/null
+++ b/src/gpu/cl/kernels/ClSoftmaxKernel.cpp
@@ -0,0 +1,263 @@
+/*
+ * Copyright (c) 2017-2021, 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "src/gpu/cl/kernels/ClSoftmaxKernel.h"
+
+#include "arm_compute/core/CL/CLCompileContext.h"
+#include "arm_compute/core/CL/CLHelpers.h"
+#include "arm_compute/core/CL/ICLTensor.h"
+#include "arm_compute/core/CL/OpenCL.h"
+#include "arm_compute/core/CoreTypes.h"
+#include "arm_compute/core/Dimensions.h"
+#include "arm_compute/core/Error.h"
+#include "arm_compute/core/Helpers.h"
+#include "arm_compute/core/ITensorInfo.h"
+#include "arm_compute/core/ITensorPack.h"
+#include "arm_compute/core/KernelDescriptors.h"
+#include "arm_compute/core/Steps.h"
+#include "arm_compute/core/TensorShape.h"
+#include "arm_compute/core/Utils.h"
+#include "arm_compute/core/utils/DataTypeUtils.h"
+#include "arm_compute/core/utils/helpers/AdjustVecSize.h"
+#include "arm_compute/core/utils/StringUtils.h"
+#include "arm_compute/core/Validate.h"
+#include "arm_compute/core/Window.h"
+
+#include "src/core/helpers/WindowHelpers.h"
+#include "support/Cast.h"
+#include "support/StringSupport.h"
+
+#include <string>
+
+namespace arm_compute
+{
+namespace opencl
+{
+namespace kernels
+{
+
+/** Default constructor.
+ *
+ * Does no work of its own; members are left to their default initialisation.
+ */
+ClSoftmaxKernel::ClSoftmaxKernel() = default;
+
+/** Check whether a source/destination/descriptor combination can be handled by this kernel.
+ *
+ * The following conditions are checked, in order; the first failing ARM_COMPUTE_RETURN_ERROR_* check
+ * ends the function early:
+ *  - @p src has at most 4 dimensions.
+ *  - @p src and @p dst have the same shape.
+ *  - @p src is F32, F16, QASYMM8 or QASYMM8_SIGNED, and @p dst has the same data type.
+ *  - @p info.input_data_type equals the data type of @p src.
+ *  - @p info.axis lies in [-num_dimensions, num_dimensions).
+ *  - For asymmetric quantized types: the source scale is not negative and the destination quantization
+ *    info equals the one returned by get_softmax_output_quantization_info().
+ *
+ * @param[in] src  Source tensor info.
+ * @param[in] dst  Destination tensor info.
+ * @param[in] info Softmax descriptor (axis, input data type, is_log).
+ *
+ * @return A default-constructed Status if every check passes.
+ */
+Status ClSoftmaxKernel::validate(const ITensorInfo &src, const ITensorInfo &dst, const SoftmaxKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(src, dst, info);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(src.num_dimensions() > 4);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_SHAPES(&src, &dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_NOT_IN( //
+        &src, DataType::F32, DataType::F16, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+    ARM_COMPUTE_RETURN_ERROR_ON_MISMATCHING_DATA_TYPES(&src, &dst);
+
+    ARM_COMPUTE_RETURN_ERROR_ON(info.input_data_type != src.data_type());
+
+    // Convert the dimension count to a signed value *before* negating it. Negating the unsigned count
+    // first would wrap around and then rely on an implementation-defined narrowing conversion to
+    // recover the intended negative bound.
+    const auto num_dims = static_cast<int32_t>(src.num_dimensions());
+    ARM_COMPUTE_RETURN_ERROR_ON(info.axis < -num_dims || num_dims <= info.axis);
+
+    if (is_data_type_quantized_asymmetric(src.data_type()))
+    {
+        ARM_COMPUTE_RETURN_ERROR_ON(src.quantization_info().uniform().scale < 0);
+
+        // The destination quantization is not free: it must be the one prescribed for this
+        // data type and softmax flavour (regular or log).
+        ARM_COMPUTE_RETURN_ERROR_ON(dst.quantization_info() !=
+                                    get_softmax_output_quantization_info(src.data_type(), info.is_log));
+    }
+
+    return Status{};
+}
+
+/** Build the OpenCL softmax kernel and its execution window for the given tensors.
+ *
+ * Selects the "softmax_x" kernel when the (wrapped) axis is 0 and "softmax_non_x" otherwise, assembles the
+ * compile-time build options, fills in the descriptor of the temporary tensor (when one is used) and
+ * configures the execution window with the softmax axis removed.
+ *
+ * No validation is performed here; see validate() for the supported configurations.
+ *
+ * @param[in]     compile_context Compile context used to create the kernel.
+ * @param[in]     src             Source tensor info.
+ * @param[in,out] dst             Destination tensor info; only read by this function.
+ * @param[in]     info            Softmax descriptor (axis, beta, is_log).
+ */
+void ClSoftmaxKernel::configure(const CLCompileContext  &compile_context,
+                                const ITensorInfo       &src,
+                                ITensorInfo             &dst,
+                                const SoftmaxKernelInfo &info)
+{
+    ARM_COMPUTE_UNUSED(compile_context, src, dst, info);
+
+    // Start from a clean state so that calling configure() more than once is safe:
+    //  - _kernel is re-created below, so the one-time argument setup in run_op() must run again;
+    //  - _tmp_info is only assigned on some paths, so a previous value must not leak through.
+    _prepared = false;
+    _tmp_info = TensorInfo();
+
+    const auto &dst_shape = dst.tensor_shape();
+
+    const auto data_type    = src.data_type();
+    const auto element_size = src.element_size();
+
+    const auto is_quantized = data_type == DataType::QASYMM8 || data_type == DataType::QASYMM8_SIGNED;
+    const auto src_qinfo    = src.quantization_info().uniform();
+    const auto dst_qinfo    = dst.quantization_info().uniform();
+
+    // Map a possibly negative axis into [0, num_dimensions).
+    const auto axis   = wrap_around(info.axis, static_cast<int32_t>(src.num_dimensions()));
+    const auto length = dst_shape[axis];
+
+    // Quantized inputs use F32 for the temporary data; other types keep their own data type.
+    const auto tmp_data_type = is_quantized ? DataType::F32 : data_type;
+
+    // Aim for 16 bytes per vector, adjusted to the extent of dimension 0.
+    const auto vec_size          = adjust_vec_size(16 / element_size, dst_shape[0]);
+    const auto vec_size_leftover = dst_shape[0] % vec_size;
+
+    std::string    kernel_name("softmax");
+    CLBuildOptions build_opts;
+
+    build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(data_type));
+    build_opts.add_option("-DTMP_DATA_TYPE=" + get_cl_type_from_data_type(tmp_data_type));
+    build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size));
+    build_opts.add_option("-DVEC_SIZE_LEFTOVER=" + support::cpp11::to_string(vec_size_leftover));
+    build_opts.add_option("-DLENGTH=" + support::cpp11::to_string(length));
+    build_opts.add_option_if(info.is_log, "-DIS_LOG");
+    build_opts.add_option("-DBETA=" + float_to_string_with_full_precision(info.beta));
+
+    build_opts.add_option_if(is_quantized, "-DIS_QUANTIZED");
+    build_opts.add_option_if(is_quantized, "-DSRC_OFFSET=" + float_to_string_with_full_precision(src_qinfo.offset));
+    build_opts.add_option_if(is_quantized, "-DSRC_SCALE=" + float_to_string_with_full_precision(src_qinfo.scale));
+    build_opts.add_option_if(is_quantized, "-DDST_OFFSET=" + float_to_string_with_full_precision(dst_qinfo.offset));
+    build_opts.add_option_if(is_quantized, "-DDST_SCALE=" + float_to_string_with_full_precision(dst_qinfo.scale));
+
+    if (axis == 0)
+    {
+        kernel_name += "_x";
+        build_opts.add_option("-DSOFTMAX_X");
+
+        // Along X a temporary tensor is only described for quantized data; it has the destination shape.
+        if (is_quantized)
+        {
+            _tmp_info = TensorInfo(dst_shape, 1, tmp_data_type);
+        }
+    }
+    else
+    {
+        kernel_name += "_non_x";
+        build_opts.add_option("-DSOFTMAX_NON_X");
+
+        TensorShape tmp_shape;
+
+        // Dimension 0 holds LENGTH vectors; dimension 1 is the X extent rounded up to a multiple of vec_size.
+        tmp_shape.set(0, length * vec_size, false);
+        tmp_shape.set(1, dst_shape[0] + (vec_size - vec_size_leftover) % vec_size, false);
+
+        // Destination dimensions 1 .. axis-1 are shifted up by one position ...
+        for (size_t i = 2; i <= static_cast<size_t>(axis); ++i)
+        {
+            tmp_shape.set(i, dst_shape[i - 1], false);
+        }
+
+        // ... while the dimensions above the softmax axis keep their position.
+        for (size_t i = axis + 1; i < dst_shape.num_dimensions(); ++i)
+        {
+            tmp_shape.set(i, dst_shape[i], false);
+        }
+
+        _tmp_info = TensorInfo(tmp_shape, 1, tmp_data_type);
+    }
+
+    _kernel = create_kernel(compile_context, kernel_name, build_opts.options());
+
+    // Configure kernel window and kernel arguments.
+    Window win = calculate_max_window(src, Steps(vec_size));
+
+    bool has_collapsed = true;
+
+    win = win.shift_dimensions(1, axis); // Remove this axis from the window/GWS.
+    win = win.collapse_if_possible(win, 2, has_collapsed);
+    ARM_COMPUTE_ERROR_ON(!has_collapsed);
+
+    ICLKernel::configure_internal(win);
+
+    // Remembered for run_op(), which needs it to pick the per-axis strides.
+    _axis = axis;
+
+    _config_id = "softmax_" + lower_string(string_from_data_type(data_type));
+    _config_id += "_" + std::to_string(axis);
+    _config_id += "_" + std::to_string(length);
+}
+
+/** Enqueue the configured softmax kernel.
+ *
+ * On the first call the stride/offset kernel arguments are derived from the tensors in @p tensors and
+ * stored in the kernel; later calls skip that step (guarded by _prepared). The buffer arguments are set
+ * on every call.
+ *
+ * @param[in] tensors Tensor pack providing ACL_SRC, ACL_DST and, when a temporary tensor is required
+ *                    (non-empty _tmp_info), ACL_INT_0.
+ * @param[in] window  Execution window; must be a valid sub-window of the configured one.
+ * @param[in] queue   Command queue the kernel is enqueued on.
+ */
+void ClSoftmaxKernel::run_op(ITensorPack &tensors, const Window &window, cl::CommandQueue &queue)
+{
+    ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this);
+    ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window);
+
+    const auto *const input =
+        utils::cast::polymorphic_downcast<const ICLTensor *>(tensors.get_const_tensor(TensorType::ACL_SRC));
+    auto *const output = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_DST));
+
+    // The temporary tensor is only looked up when configure() described one.
+    ICLTensor *scratch = nullptr;
+    if (_tmp_info.total_size() > 0)
+    {
+        scratch = utils::cast::polymorphic_downcast<ICLTensor *>(tensors.get_tensor(TensorType::ACL_INT_0));
+    }
+
+    if (!_prepared)
+    {
+        _prepared = true;
+
+        const auto *input_info  = input->info();
+        const auto *output_info = output->info();
+
+        // Work on copies of the strides because the softmax axis is erased from them below.
+        auto input_strides  = input_info->strides_in_bytes();
+        auto output_strides = output_info->strides_in_bytes();
+
+        // Read the strides along the softmax axis before that axis is erased.
+        const auto input_axis_stride  = input_strides[_axis];
+        const auto output_axis_stride = output_strides[_axis];
+
+        // The softmax axis is not part of the execution window, so its stride is dropped from the list
+        // handed to the kernel. For axis > 0 it is passed separately (arguments 15 and 16), independent
+        // of the global ID.
+        input_strides.remove(_axis);
+        output_strides.remove(_axis);
+
+        // Arguments 1-3: source strides, argument 4: source offset (argument 0 is the source buffer).
+        for (cl_uint d = 0; d < 3; ++d)
+        {
+            _kernel.setArg<cl_uint>(1 + d, input_strides[d]);
+        }
+        _kernel.setArg<cl_uint>(4, input_info->offset_first_element_in_bytes());
+
+        // Arguments 6-8: destination strides, argument 9: destination offset (argument 5 is the destination buffer).
+        for (cl_uint d = 0; d < 3; ++d)
+        {
+            _kernel.setArg<cl_uint>(6 + d, output_strides[d]);
+        }
+        _kernel.setArg<cl_uint>(9, output_info->offset_first_element_in_bytes());
+
+        if (scratch != nullptr)
+        {
+            const auto &scratch_strides = scratch->info()->strides_in_bytes();
+
+            // Arguments 11-13: temporary tensor strides of dimensions 1-3 (argument 10 is the buffer).
+            for (cl_uint d = 0; d < 3; ++d)
+            {
+                _kernel.setArg<cl_uint>(11 + d, scratch_strides[1 + d]);
+            }
+            // Argument 14 is fixed to 0 (presumably the temporary tensor's first-element offset - TODO confirm).
+            _kernel.setArg<cl_uint>(14, 0);
+        }
+
+        if (_axis > 0)
+        {
+            _kernel.setArg<cl_uint>(15, input_axis_stride);
+            _kernel.setArg<cl_uint>(16, output_axis_stride);
+        }
+    }
+
+    // Buffer arguments are refreshed on every run.
+    _kernel.setArg(0, input->cl_buffer());
+    _kernel.setArg(5, output->cl_buffer());
+
+    if (scratch != nullptr)
+    {
+        _kernel.setArg(10, scratch->cl_buffer());
+    }
+
+    enqueue(queue, *this, window, lws_hint());
+}
+
+/** Accessor for the descriptor of the temporary tensor.
+ *
+ * @return Reference to the stored temporary tensor info. run_op() treats a total size of 0 as
+ *         "no temporary tensor needed".
+ */
+const TensorInfo &ClSoftmaxKernel::tmp_tensor_info() const
+{
+    return this->_tmp_info;
+}
+
+} // namespace kernels
+} // namespace opencl
+} // namespace arm_compute