From 3d13af8a39f408318328a95d5329bc17fd923438 Mon Sep 17 00:00:00 2001 From: Georgios Pinitas Date: Tue, 4 Jun 2019 13:04:16 +0100 Subject: COMPMID-2235: Extend type support for CL/NEON DequantizationLayer. Adds support for: - QSYMM8 Change-Id: Ia0b839fc844ce0f968dad1b69a001f9a660dbcd5 Signed-off-by: Georgios Pinitas Reviewed-on: https://review.mlplatform.org/c/1378 Comments-Addressed: Arm Jenkins Tested-by: Arm Jenkins Reviewed-by: Manuel Bottini Reviewed-by: Michalis Spyrou --- src/core/CL/CLHelpers.cpp | 10 +-- src/core/CL/cl_kernels/dequantization_layer.cl | 15 +++-- .../CL/kernels/CLDequantizationLayerKernel.cpp | 12 ++-- .../NEON/kernels/NEDequantizationLayerKernel.cpp | 74 ++++++++++++++++++++-- src/core/Utils.cpp | 2 + src/runtime/CL/CLTensorAllocator.cpp | 9 +-- 6 files changed, 96 insertions(+), 26 deletions(-) (limited to 'src') diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index f4ceca8200..2e6ceb4433 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -37,11 +37,11 @@ std::string get_cl_type_from_data_type(const DataType &dt) switch(dt) { case DataType::U8: + case DataType::QASYMM8: return "uchar"; case DataType::S8: + case DataType::QSYMM8: return "char"; - case DataType::QASYMM8: - return "uchar"; case DataType::U16: return "ushort"; case DataType::S16: @@ -69,11 +69,11 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) switch(dt) { case DataType::U8: + case DataType::QASYMM8: return "uchar"; case DataType::S8: + case DataType::QSYMM8: return "char"; - case DataType::QASYMM8: - return "uchar"; case DataType::U16: return "ushort"; case DataType::F16: @@ -100,6 +100,7 @@ std::string get_data_size_from_data_type(const DataType &dt) { case DataType::U8: case DataType::S8: + case DataType::QSYMM8: case DataType::QASYMM8: return "8"; case DataType::U16: @@ -241,6 +242,7 @@ size_t preferred_vector_width(const cl::Device &device, const DataType dt) case DataType::U8: case DataType::S8: case 
DataType::QASYMM8: + case DataType::QSYMM8: return device.getInfo(); case DataType::U16: case DataType::S16: diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl index 7307700473..ad3ed35480 100644 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ b/src/core/CL/cl_kernels/dequantization_layer.cl @@ -23,16 +23,17 @@ */ #include "helpers.h" -#if defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET) +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) /** This performs the dequantization of 8-bit unsigned integers to floating point. * - * @note Datatype should be given as a preprocessor argument using -DDATA_TYPE=type. e.g. -DDATA_TYPE=float + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 * @note Quantization scale of input tensor is passed in with -DSCALE=scale. * @note Quantization offset of input tensor is passed in with -DOFFSET=offset. * - * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] input_ptr Pointer to the source tensor. 
Supported data types: QASYMM8/QSYMM8 * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) @@ -66,7 +67,7 @@ __kernel void dequantization_layer( // Load data VEC_DATA_TYPE(int, VEC_SIZE) - val = CONVERT(VLOAD(VEC_SIZE)(0, (__global uchar *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); // Create scale and offset vectors const VEC_DATA_TYPE(float, VEC_SIZE) @@ -81,10 +82,10 @@ __kernel void dequantization_layer( // Store result VSTORE(VEC_SIZE) - (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE)), 0, (__global DATA_TYPE *)output.ptr); + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); #else // !defined(VEC_SIZE) || !defined(LAST_ACCESSED_X) - *((__global DATA_TYPE *)(output.ptr)) = (DATA_TYPE)((float)((int)(*((__global uchar *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); #endif // defined(VEC_SIZE) && defined(LAST_ACCESSED_X) } -#endif // defined(VEC_SIZE) && defined(DATA_TYPE) && defined(SCALE) && defined(OFFSET) \ No newline at end of file +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index 0b066837a9..e383bc475d 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -40,7 +40,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { 
ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8); if(output->tensor_shape().total_size() > 0) { @@ -95,15 +95,19 @@ void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *o } ICLKernel::configure_internal(win); - const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); + const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); + const int qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0; // Create kernel CLBuildOptions build_opts; build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qinfo.offset)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset)); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); - build_opts.add_option("-DDATA_TYPE=" + get_cl_type_from_data_type(output->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type())); + build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max(output_width_x - vec_size_x, 0))); + + // Create kernel name _kernel = static_cast(CLKernelLibrary::get().create_kernel("dequantization_layer", build_opts.options())); } diff --git a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp index a6dc0977d2..bf0a2ca7bf 100644 --- a/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp +++ b/src/core/NEON/kernels/NEDequantizationLayerKernel.cpp @@ -42,7 +42,7 @@ namespace Status validate_arguments(const ITensorInfo *input, 
const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8); if(output->tensor_shape().total_size() > 0) { @@ -95,9 +95,11 @@ inline void store_result(float16_t *ptr, const float32x4x4_t &v) #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ template -void run_dequantization(const ITensor *input, ITensor *output, const Window &window) +void run_dequantization_qasymm8(const ITensor *input, ITensor *output, const Window &window) { - const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + const int32_t offset = qinfo.offset; const int window_step_x = 16; const auto window_start_x = static_cast(window.x().start()); @@ -120,7 +122,49 @@ void run_dequantization(const ITensor *input, ITensor *output, const Window &win for(; x <= (window_end_x - window_step_x); x += window_step_x) { const auto vin = wrapper::vloadq(in_ptr + x); - const auto vdeq = vdequantize(vin, qinfo); + const auto vdeq = vdequantize(vin, scale, offset); + + store_result(reinterpret_cast(out_ptr + x), vdeq); + } + + // Compute left-over elements + for(; x < window_end_x; ++x) + { + uint8_t val = *(in_ptr + x); + *(out_ptr + x) = static_cast(dequantize(val, scale, offset)); + } + }, + in, out); +} + +template +void run_dequantization_qsymm8(const ITensor *input, ITensor *output, const Window &window) +{ + const UniformQuantizationInfo &qinfo = input->info()->quantization_info().uniform(); + const float scale = qinfo.scale; + + const int window_step_x = 16; + const auto window_start_x = static_cast(window.x().start()); + const auto window_end_x = static_cast(window.x().end()); + + // Collapse window and reset first dimension to handle tail calculations 
manually + Window win_collapsed = window.collapse_if_possible(window, Window::DimZ); + win_collapsed.set(Window::DimX, Window::Dimension(0, 1, 1)); + + // Create iterators + Iterator in(input, win_collapsed); + Iterator out(output, win_collapsed); + + execute_window_loop(win_collapsed, [&](const Coordinates &) + { + const auto in_ptr = reinterpret_cast(in.ptr()); + const auto out_ptr = reinterpret_cast(out.ptr()); + + int x = window_start_x; + for(; x <= (window_end_x - window_step_x); x += window_step_x) + { + const auto vin = wrapper::vloadq(in_ptr + x); + const auto vdeq = vdequantize(vin, scale); store_result(reinterpret_cast(out_ptr + x), vdeq); } @@ -129,11 +173,27 @@ void run_dequantization(const ITensor *input, ITensor *output, const Window &win for(; x < window_end_x; ++x) { uint8_t val = *(in_ptr + x); - *(out_ptr + x) = static_cast(dequantize_qasymm8(val, qinfo)); + *(out_ptr + x) = static_cast(dequantize(val, scale)); } }, in, out); } + +template +void run_dequantization_core(const ITensor *input, ITensor *output, const Window &window) +{ + switch(input->info()->data_type()) + { + case DataType::QASYMM8: + run_dequantization_qasymm8(input, output, window); + break; + case DataType::QSYMM8: + run_dequantization_qsymm8(input, output, window); + break; + default: + ARM_COMPUTE_ERROR("Unsupported data type."); + } +} } // namespace NEDequantizationLayerKernel::NEDequantizationLayerKernel() @@ -173,11 +233,11 @@ void NEDequantizationLayerKernel::run(const Window &window, const ThreadInfo &in switch(_output->info()->data_type()) { case DataType::F32: - run_dequantization(_input, _output, window); + run_dequantization_core(_input, _output, window); break; #ifdef __ARM_FEATURE_FP16_VECTOR_ARITHMETIC case DataType::F16: - run_dequantization(_input, _output, window); + run_dequantization_core(_input, _output, window); break; #endif /* __ARM_FEATURE_FP16_VECTOR_ARITHMETIC */ default: diff --git a/src/core/Utils.cpp b/src/core/Utils.cpp index 22be0002ee..499a6c8b29 
100644 --- a/src/core/Utils.cpp +++ b/src/core/Utils.cpp @@ -158,6 +158,8 @@ const std::string &arm_compute::string_from_data_type(DataType dt) { DataType::F32, "F32" }, { DataType::F64, "F64" }, { DataType::SIZET, "SIZET" }, + { DataType::QSYMM8, "QSYMM8" }, + { DataType::QSYMM8_PER_CHANNEL, "QSYMM8_PER_CHANNEL" }, { DataType::QASYMM8, "QASYMM8" }, { DataType::QSYMM16, "QSYMM16" }, }; diff --git a/src/runtime/CL/CLTensorAllocator.cpp b/src/runtime/CL/CLTensorAllocator.cpp index 63aa1ba9ea..f3f16cd8c0 100644 --- a/src/runtime/CL/CLTensorAllocator.cpp +++ b/src/runtime/CL/CLTensorAllocator.cpp @@ -87,11 +87,12 @@ void populate_quantization_info(CLFloatArray &scale, CLInt32Array &offset, const clear_quantization_arrays(scale, offset); // Create scale array - const size_t num_elements = qinfo.scale.size(); - const size_t element_size = sizeof(decltype(qinfo.scale)::value_type); - scale = CLFloatArray(num_elements + pad_size); + const std::vector &qscale = qinfo.scale(); + const size_t num_elements = qscale.size(); + const size_t element_size = sizeof(std::remove_reference::type::value_type); + scale = CLFloatArray(num_elements + pad_size); scale.resize(num_elements); - CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale.data()); + CLScheduler::get().queue().enqueueWriteBuffer(scale.cl_buffer(), CL_TRUE, 0, num_elements * element_size, qinfo.scale().data()); } } // namespace -- cgit v1.2.1