aboutsummaryrefslogtreecommitdiff
path: root/src
diff options
context:
space:
mode:
authorRamy Elgammal <ramy.elgammal@arm.com>2023-01-30 04:56:47 +0000
committerRamy Elgammal <ramy.elgammal@arm.com>2023-03-17 12:45:26 +0000
commit14d7b535d48620f009efca576cc70fb6ea9ff20d (patch)
tree592ae31798825fba0b71b36adda008fb53381e9f /src
parent2b6ebfe4270b06b09e45f306e8384950aeca7e4e (diff)
downloadComputeLibrary-14d7b535d48620f009efca576cc70fb6ea9ff20d.tar.gz
Implementation of RSQRT for quantized int8
Resolves: COMPMID-5863 Change-Id: I9ff67face62826c1d335a6b941e8516be39bdac8 Signed-off-by: Ramy Elgammal <ramy.elgammal@arm.com> Reviewed-on: https://eu-gerrit-1.euhpc.arm.com/c/VisualCompute/ComputeLibrary/+/488768 Tested-by: bsgcomp <bsgcomp@arm.com> Comments-Addressed: bsgcomp <bsgcomp@arm.com> Reviewed-by: Gunes Bayir <gunes.bayir@arm.com> Reviewed-on: https://review.mlplatform.org/c/ml/ComputeLibrary/+/9225 Tested-by: Arm Jenkins <bsgcomp@arm.com> Comments-Addressed: Arm Jenkins <bsgcomp@arm.com> Benchmark: Arm Jenkins <bsgcomp@arm.com>
Diffstat (limited to 'src')
-rw-r--r--src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl77
-rw-r--r--src/gpu/cl/ClKernelLibrary.cpp5
-rw-r--r--src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp41
3 files changed, 110 insertions, 13 deletions
diff --git a/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
new file mode 100644
index 0000000000..2e4cdc53fe
--- /dev/null
+++ b/src/core/CL/cl_kernels/common/elementwise_unary_quantized.cl
@@ -0,0 +1,77 @@
+/*
+ * Copyright (c) 2023 Arm Limited.
+ *
+ * SPDX-License-Identifier: MIT
+ *
+ * Permission is hereby granted, free of charge, to any person obtaining a copy
+ * of this software and associated documentation files (the "Software"), to
+ * deal in the Software without restriction, including without limitation the
+ * rights to use, copy, modify, merge, publish, distribute, sublicense, and/or
+ * sell copies of the Software, and to permit persons to whom the Software is
+ * furnished to do so, subject to the following conditions:
+ *
+ * The above copyright notice and this permission notice shall be included in all
+ * copies or substantial portions of the Software.
+ *
+ * THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+ * IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+ * FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+ * AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+ * LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+ * OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+ * SOFTWARE.
+ */
+#include "helpers.h"
+
+#if defined(DATA_TYPE) && defined(OPERATION)
+// Calculate reverse square root
+#define rsqrt_op(input) rsqrt(input)
+#if defined(VEC_SIZE)
+#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE)
+#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE)
+#define VEC_TYPE VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+#endif // defined(VEC_SIZE)
+
+/** Applies element wise unary operator in a tensor.
+ *
+ * @param[in] in_ptr Pointer to the source image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] in_stride_x Stride of the source tensor in X dimension (in bytes)
+ * @param[in] in_step_x in_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] in_stride_y Stride of the source tensor in Y dimension (in bytes)
+ * @param[in] in_step_y in_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] in_stride_z Stride of the source tensor in Z dimension (in bytes)
+ * @param[in] in_step_z in_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] in_offset_first_element_in_bytes Offset of the first element in the source image
+ * @param[out] out_ptr Pointer to the destination image. Supported data types: QASYMM8/QASYMM8_SIGNED.
+ * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes)
+ * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes)
+ * @param[in] out_step_y Stride of the destination tensor in Y dimension (in bytes)
+ * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes)
+ * @param[in] out_stride_z Stride of the destination tensor in Z dimension (in bytes)
+ * @param[in] out_step_z out_stride_z * number of elements along Z processed per workitem(in bytes)
+ * @param[in] out_offset_first_element_in_bytes Offset of the first element in the destination image
+ */
+__kernel void elementwise_unary_quantized(
+ TENSOR3D_DECLARATION(in),
+ TENSOR3D_DECLARATION(out))
+{
+ Tensor3D in = CONVERT_TO_TENSOR3D_STRUCT(in);
+ Tensor3D out = CONVERT_TO_TENSOR3D_STRUCT(out);
+
+ // Check if access on width gets out of bounds
+ // If it does shift access vector to access elements within bounds
+ const int xi = (int)(get_global_id(0) * VEC_SIZE);
+ in.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * in_stride_x;
+ out.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * out_stride_x;
+
+ VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)
+ data = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in.ptr);
+ VEC_DATA_TYPE(float, VEC_SIZE)
+ data_f32 = CONVERT(data, VEC_FLOAT);
+ data_f32 = (data_f32 - (float)OFFSET_IN) * (float)SCALE_IN;
+ VEC_INT qres_int = CONVERT_SAT((OPERATION(data_f32) / ((VEC_FLOAT)(float)SCALE_OUT)), VEC_INT) + ((VEC_INT)((int)OFFSET_OUT));
+ const VEC_TYPE qres = CONVERT_SAT(qres_int, VEC_TYPE);
+ VSTORE(VEC_SIZE)
+ (qres, 0, (__global DATA_TYPE *)out.ptr);
+}
+#endif // defined(DATA_TYPE) && defined(OPERATION)
diff --git a/src/gpu/cl/ClKernelLibrary.cpp b/src/gpu/cl/ClKernelLibrary.cpp
index 482e8c341d..8099071fcd 100644
--- a/src/gpu/cl/ClKernelLibrary.cpp
+++ b/src/gpu/cl/ClKernelLibrary.cpp
@@ -235,6 +235,7 @@ const std::map<std::string, std::string> ClKernelLibrary::_kernel_program_map =
{ "elementwise_operation_SQUARED_DIFF_quantized", "common/elementwise_operation_quantized.cl" },
{ "elementwise_operation_PRELU_quantized", "common/elementwise_operation_quantized.cl" },
{ "elementwise_unary", "common/elementwise_unary.cl" },
+ { "elementwise_unary_quantized", "common/elementwise_unary_quantized.cl" },
{ "fft_digit_reverse_axis_0", "common/fft_digit_reverse.cl" },
{ "fft_digit_reverse_axis_1", "common/fft_digit_reverse.cl" },
{ "fft_radix_2_first_stage_axis_0", "common/fft.cl" },
@@ -572,6 +573,10 @@ const std::map<std::string, std::string> ClKernelLibrary::_program_source_map =
#include "./cl_kernels/common/elementwise_unary.clembed"
},
{
+ "common/elementwise_unary_quantized.cl",
+#include "./cl_kernels/common/elementwise_unary_quantized.clembed"
+ },
+ {
"common/fft.cl",
#include "./cl_kernels/common/fft.clembed"
},
diff --git a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
index 1f09515b86..40b1eaca1f 100644
--- a/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
+++ b/src/gpu/cl/kernels/ClElementwiseUnaryKernel.cpp
@@ -1,5 +1,5 @@
/*
- * Copyright (c) 2018-2021 Arm Limited.
+ * Copyright (c) 2018-2021, 2023 Arm Limited.
*
* SPDX-License-Identifier: MIT
*
@@ -38,6 +38,8 @@ namespace kernels
{
namespace
{
+constexpr unsigned int vector_size_byte_opencl = 16;
+
Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const ElementWiseUnary op)
{
ARM_COMPUTE_RETURN_ERROR_ON_F16_UNSUPPORTED(&src);
@@ -49,6 +51,10 @@ Status validate_arguments(const ITensorInfo &src, const ITensorInfo &dst, const
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::S32);
}
+ else if(op == ElementWiseUnary::RSQRT) // Allow quantized types for only RSQRT.
+ {
+ ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32, DataType::QASYMM8, DataType::QASYMM8_SIGNED);
+ }
else
{
ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(&src, 1, DataType::F16, DataType::F32);
@@ -78,17 +84,29 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
auto padding_info = get_padding_info({ src, dst });
ARM_COMPUTE_ERROR_THROW_ON(validate_arguments(*src, *dst, op));
+ const unsigned int num_elems_processed_per_iteration = adjust_vec_size(vector_size_byte_opencl / dst->element_size(), dst->dimension(0));
- const std::string kernel_name = "elementwise_unary";
- const int vec_size_x = 16 / dst->element_size();
- const int dst_width_x = dst->tensor_shape().x();
- const bool multi_access_x = (dst_width_x / vec_size_x > 0);
-
+ std::string kernel_name = "elementwise_unary";
+ const int vec_size_x = num_elems_processed_per_iteration;
+ const int dst_width_x = dst->dimension(0);
+ if(is_data_type_quantized(src->data_type()))
+ {
+ kernel_name += "_quantized";
+ }
// Set kernel build options
CLBuildOptions build_opts;
build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(src->data_type()));
- build_opts.add_option_if(multi_access_x, "-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
- build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+ build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x));
+ build_opts.add_option("-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(dst_width_x - vec_size_x, 0)));
+ if(is_data_type_quantized(src->data_type()))
+ {
+ const UniformQuantizationInfo iqinfo = src->quantization_info().uniform();
+ const UniformQuantizationInfo oqinfo = dst->quantization_info().uniform();
+ build_opts.add_option("-DOFFSET_IN=" + support::cpp11::to_string(iqinfo.offset));
+ build_opts.add_option("-DOFFSET_OUT=" + support::cpp11::to_string(oqinfo.offset));
+ build_opts.add_option("-DSCALE_IN=" + float_to_string_with_full_precision(iqinfo.scale));
+ build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(oqinfo.scale));
+ }
switch(op)
{
case ElementWiseUnary::RSQRT:
@@ -124,11 +142,8 @@ void ClElementWiseUnaryKernel::configure(const CLCompileContext &compile_context
// Configure kernel window
Window win = calculate_max_window(*dst);
- if(multi_access_x)
- {
- win.set(Window::DimX,
- Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
- }
+ win.set(Window::DimX, Window::Dimension(win.x().start(), ceil_to_multiple(win.x().end(), vec_size_x), vec_size_x));
+
ICLKernel::configure_internal(win);
ARM_COMPUTE_ERROR_ON(has_padding_changed(padding_info));