From ab0a77edcb9f48de2aad216323b791d0dd95a3cd Mon Sep 17 00:00:00 2001 From: Michele Di Giorgio Date: Wed, 21 Jun 2017 15:36:24 +0100 Subject: COMPMID-409: Add support for QS8 and QS16 CLPixelWiseMultiplication. Change-Id: I7f66d49d746ba9fb6e726ccab83d3a97b8ddef80 Reviewed-on: http://mpd-gerrit.cambridge.arm.com/78491 Reviewed-by: Georgios Pinitas Tested-by: Kaizen --- src/core/CL/cl_kernels/fixed_point.h | 36 ++++++++++++++++---- src/core/CL/cl_kernels/pixelwise_mul_int.cl | 39 ++++++++++++++++------ .../CL/kernels/CLPixelWiseMultiplicationKernel.cpp | 38 +++++++++++++++++---- 3 files changed, 89 insertions(+), 24 deletions(-) (limited to 'src/core/CL') diff --git a/src/core/CL/cl_kernels/fixed_point.h b/src/core/CL/cl_kernels/fixed_point.h index dcdf840444..b0dab0affc 100644 --- a/src/core/CL/cl_kernels/fixed_point.h +++ b/src/core/CL/cl_kernels/fixed_point.h @@ -162,13 +162,34 @@ SUBQ_SAT_IMPL(qs8x16) #define SUB_SAT_OP_EXPAND_STR(a, b, type, size) sub_sat_##type##x##size((a), (b)) #define SUB_SAT_OP_EXPAND(a, b, type, size) SUB_SAT_OP_EXPAND_STR(a, b, type, size) -/** Saturate multiply of two fixed point numbers - * - * @param[in] type the actual data type. - * @param[in] itype the intermediate data type. - * - * @return The result of the fixed point multiplication. The result is saturated in case of overflow - */ +/* Multiply of two fixed point numbers + * + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. + * + * @return The result of the fixed point multiplication. + */ +#define MULQ_IMPL(type, itype) \ + inline type mul_##type(type VopA, type VopB, int fixed_point_position) \ + { \ + itype round_val = (itype)(1 << (fixed_point_position - 1)); \ + itype res = CONVERT((VopA), itype) * CONVERT((VopB), itype) + round_val; \ + return CONVERT((res >> (itype)fixed_point_position), type); \ + } + +MULQ_IMPL(qs8x16, qs16x16) +MULQ_IMPL(qs16x16, qs32x16) + +#define MUL_OP_EXPAND_STR(a, b, type, size, position) mul_##type##x##size((a), (b), (position)) +#define MUL_OP_EXPAND(a, b, type, size, position) MUL_OP_EXPAND_STR(a, b, type, size, position) + +/* Saturate multiply of two fixed point numbers + * + * @param[in] type the actual data type. + * @param[in] itype the intermediate data type. + * + * @return The result of the fixed point multiplication. The result is saturated in case of overflow + */ #define MULQ_SAT_IMPL(type, itype) \ inline type mul_sat_##type(type VopA, type VopB, int fixed_point_position) \ { \ @@ -179,6 +200,7 @@ SUBQ_SAT_IMPL(qs8x16) MULQ_SAT_IMPL(qs8x16, qs16x16) MULQ_SAT_IMPL(qs16x8, qs32x8) +MULQ_SAT_IMPL(qs16x16, qs32x16) #define MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) mul_sat_##type##x##size((a), (b), (position)) #define MUL_SAT_OP_EXPAND(a, b, type, size, position) MUL_SAT_OP_EXPAND_STR(a, b, type, size, position) diff --git a/src/core/CL/cl_kernels/pixelwise_mul_int.cl b/src/core/CL/cl_kernels/pixelwise_mul_int.cl index e6dfd3043d..a407a3264e 100644 --- a/src/core/CL/cl_kernels/pixelwise_mul_int.cl +++ b/src/core/CL/cl_kernels/pixelwise_mul_int.cl @@ -23,12 +23,28 @@ */ #include "helpers.h" -#ifdef SATURATE -#define CONVERT_OP_INT_STR(x, type) (convert_##type##_sat(x)) -#else /* SATURATE */ -#define CONVERT_OP_INT_STR(x, type) (convert_##type(x)) -#endif /* SATURATE */ -#define CONVERT_OP_INT(x, type) CONVERT_OP_INT_STR(x, type) +#if defined(FIXED_POINT_POSITION) + +#include "fixed_point.h" + +#if defined(SATURATE) +#define MUL_OP(x, y, scale, type, size) MUL_SAT_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) +#else // SATURATE +#define MUL_OP(x, y, scale, type, size) MUL_OP_EXPAND((x), (y), type, size, FIXED_POINT_POSITION) +#endif // SATURATE + +#else // FIXED_POINT_POSITION + +#if defined(SATURATE) +#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size##_sat(x)) +#else // SATURATE +#define CONVERT_OP_INT_STR(x, type, size) (convert_##type##size(x)) +#endif // SATURATE +#define CONVERT_OP_INT(x, type, size) CONVERT_OP_INT_STR(x, type, size) + +#define MUL_OP(x, y, scale, type, size) CONVERT_OP_INT((x) * (y) >> scale, type, size) + +#endif // FIXED_POINT_POSITION /** Performs a pixelwise multiplication with integer scale of integer inputs. * @@ -36,26 +52,27 @@ * e.g. -DDATA_TYPE_IN1=uchar -DDATA_TYPE_IN2=ushort -DDATA_TYPE_OUT=short * @attention The data_type of the intermediate result of the multiplication should passed as well using -DDATA_TYPE_RES. * e.g. If one of inputs is S16 -DDATA_TYPE_RES=int should be passed else -DDATA_TYPE_RES=short. + * @note In case of fixed-point operation -DFIXED_POINT_POSITION=fixed_point_position must be provided: e.g. -DFIXED_POINT_POSITION=3 * - * @param[in] in1_ptr Pointer to the source image. Supported data types: U8, S16 + * @param[in] in1_ptr Pointer to the source image. Supported data types: U8/QS8/QS16/S16 * @param[in] in1_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in1_step_x in1_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in1_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] in1_step_y in1_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] in1_offset_first_element_in_bytes The offset of the first element in the source image - * @param[in] in2_ptr Pointer to the source image. Supported data types: U8, S16 + * @param[in] in2_ptr Pointer to the source image. Supported data types: same as @p in1_ptr * @param[in] in2_stride_x Stride of the source image in X dimension (in bytes) * @param[in] in2_step_x in2_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] in2_stride_y Stride of the source image in Y dimension (in bytes) * @param[in] in2_step_y in2_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] in2_offset_first_element_in_bytes The offset of the first element in the source image - * @param[out] out_ptr Pointer to the destination image. Supported data types: U8, S16 + * @param[out] out_ptr Pointer to the destination image. Supported data types: same as @p in1_ptr * @param[in] out_stride_x Stride of the destination image in X dimension (in bytes) * @param[in] out_step_x out_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] out_stride_y Stride of the destination image in Y dimension (in bytes) * @param[in] out_step_y out_stride_y * number of elements along Y processed per workitem(in bytes) * @param[in] out_offset_first_element_in_bytes The offset of the first element in the destination image - * @param[in] scale Integer scaling factor. Supported data types: S32 + * @param[in] scale Integer scaling factor. Supported data types: S32 (ignored for QS8 and QS16 as the assumption is scale = 1). */ __kernel void pixelwise_mul_int( IMAGE_DECLARATION(in1), @@ -75,5 +92,5 @@ __kernel void pixelwise_mul_int( in2_data = CONVERT(vload16(0, (__global DATA_TYPE_IN2 *)in2.ptr), VEC_DATA_TYPE(DATA_TYPE_RES, 16)); // Perform multiplication and store result - vstore16(CONVERT_OP_INT(((in1_data * in2_data) >> scale), VEC_DATA_TYPE(DATA_TYPE_OUT, 16)), 0, (__global DATA_TYPE_OUT *)out.ptr); + vstore16(MUL_OP(in1_data, in2_data, scale, DATA_TYPE_OUT, 16), 0, (__global DATA_TYPE_OUT *)out.ptr); } diff --git a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp index da417a9020..b95e8fac73 100644 --- a/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp +++ b/src/core/CL/kernels/CLPixelWiseMultiplicationKernel.cpp @@ -65,12 +65,19 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I } ARM_COMPUTE_ERROR_ON_MISMATCHING_SHAPES(input1, input2, output); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); - ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input1, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input2, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); + ARM_COMPUTE_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(output, 1, DataType::U8, DataType::QS8, DataType::QS16, DataType::S16, DataType::F16, DataType::F32); ARM_COMPUTE_ERROR_ON_MSG(output->info()->data_type() == DataType::U8 && (input1->info()->data_type() != DataType::U8 || input2->info()->data_type() != DataType::U8), "Output can only be U8 if both inputs are U8"); ARM_COMPUTE_ERROR_ON_MSG(scale < 0, "Scale cannot be negative. "); + if(is_data_type_fixed_point(input1->info()->data_type())) + { + // All data types must be all QS8 or all QS16 + ARM_COMPUTE_ERROR_ON_MISMATCHING_DATA_TYPES(input1, input2, output); + ARM_COMPUTE_ERROR_ON_MISMATCHING_FIXED_POINT_POSITION(input1, input2, output); + ARM_COMPUTE_ERROR_ON_MSG(scale != 1, "Unsupported scaling factor for QS8/QS16. Scale must be 1."); + } _input1 = input1; _input2 = input2; @@ -96,13 +103,28 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I if(is_data_type_float(input1->info()->data_type()) || is_data_type_float(input2->info()->data_type())) { scale_int = -1; - compute_type = (DataType::F32 == input1->info()->data_type() || DataType::F32 == input2->info()->data_type()) ? "float" : "half"; + compute_type = (input1->info()->data_type() == DataType::F32 || input2->info()->data_type() == DataType::F32) ? "float" : "half"; data_type = "DATA_TYPE_FLOAT"; } else { - compute_type = (DataType::S16 == input1->info()->data_type() || DataType::S16 == input2->info()->data_type()) ? "int" : "ushort"; - data_type = "DATA_TYPE_INT"; + if(input1->info()->data_type() == DataType::S16 || input2->info()->data_type() == DataType::S16) + { + compute_type = "int"; + } + else if(input1->info()->data_type() == DataType::QS8) + { + compute_type = "qs8"; + } + else if(input1->info()->data_type() == DataType::QS16) + { + compute_type = "qs16"; + } + else + { + compute_type = "ushort"; + } + data_type = "DATA_TYPE_INT"; } // Construct kernel name @@ -113,6 +135,10 @@ void CLPixelWiseMultiplicationKernel::configure(const ICLTensor *input1, const I std::set build_opts; build_opts.emplace((overflow_policy == ConvertPolicy::WRAP || is_data_type_float(output->info()->data_type())) ? "-DWRAP" : "-DSATURATE"); build_opts.emplace((rounding_policy == RoundingPolicy::TO_ZERO) ? "-DROUND=_rtz" : "-DROUND=_rte"); + if(is_data_type_fixed_point(input1->info()->data_type())) + { + build_opts.emplace("-DFIXED_POINT_POSITION=" + support::cpp11::to_string(input1->info()->fixed_point_position())); + } build_opts.emplace("-DDATA_TYPE_IN1=" + get_cl_type_from_data_type(input1->info()->data_type())); build_opts.emplace("-DDATA_TYPE_IN2=" + get_cl_type_from_data_type(input2->info()->data_type())); build_opts.emplace("-DDATA_TYPE_OUT=" + get_cl_type_from_data_type(output->info()->data_type())); -- cgit v1.2.1