diff options
Diffstat (limited to 'src/core/CL')
-rw-r--r-- | src/core/CL/CLHelpers.cpp | 8 | ||||
-rw-r--r-- | src/core/CL/CLKernelLibrary.cpp | 2 | ||||
-rw-r--r-- | src/core/CL/cl_kernels/dequantization_layer.cl | 134 | ||||
-rw-r--r-- | src/core/CL/kernels/CLDequantizationLayerKernel.cpp | 39 |
4 files changed, 173 insertions, 10 deletions
diff --git a/src/core/CL/CLHelpers.cpp b/src/core/CL/CLHelpers.cpp index e80349e486..bb3cf7fda2 100644 --- a/src/core/CL/CLHelpers.cpp +++ b/src/core/CL/CLHelpers.cpp @@ -38,9 +38,11 @@ std::string get_cl_type_from_data_type(const DataType &dt) { case DataType::U8: case DataType::QASYMM8: + case DataType::QASYMM8_PER_CHANNEL: return "uchar"; case DataType::S8: case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: return "char"; case DataType::U16: return "ushort"; @@ -71,9 +73,11 @@ std::string get_cl_select_type_from_data_type(const DataType &dt) { case DataType::U8: case DataType::QASYMM8: + case DataType::QASYMM8_PER_CHANNEL: return "uchar"; case DataType::S8: case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: return "char"; case DataType::U16: return "ushort"; @@ -104,6 +108,8 @@ std::string get_data_size_from_data_type(const DataType &dt) case DataType::S8: case DataType::QSYMM8: case DataType::QASYMM8: + case DataType::QSYMM8_PER_CHANNEL: + case DataType::QASYMM8_PER_CHANNEL: return "8"; case DataType::U16: case DataType::S16: @@ -246,6 +252,8 @@ size_t preferred_vector_width(const cl::Device &device, const DataType dt) case DataType::S8: case DataType::QASYMM8: case DataType::QSYMM8: + case DataType::QSYMM8_PER_CHANNEL: + case DataType::QASYMM8_PER_CHANNEL: return device.getInfo<CL_DEVICE_PREFERRED_VECTOR_WIDTH_CHAR>(); case DataType::U16: case DataType::S16: diff --git a/src/core/CL/CLKernelLibrary.cpp b/src/core/CL/CLKernelLibrary.cpp index 4b3b37c3da..d1500f00b5 100644 --- a/src/core/CL/CLKernelLibrary.cpp +++ b/src/core/CL/CLKernelLibrary.cpp @@ -236,6 +236,8 @@ const std::map<std::string, std::string> CLKernelLibrary::_kernel_program_map = { "depthwise_im2col", "depthwise_convolution.cl" }, { "depthwise_vector_to_tensor", "depthwise_convolution.cl" }, { "dequantization_layer", "dequantization_layer.cl" }, + { "dequantization_layer_per_channel_nhwc", "dequantization_layer.cl" }, + { "dequantization_layer_per_channel_nchw", "dequantization_layer.cl" }, { "derivative", "derivative.cl" }, { "dilate", "dilate.cl" }, { "direct_convolution1x1", "direct_convolution1x1.cl" }, diff --git a/src/core/CL/cl_kernels/dequantization_layer.cl b/src/core/CL/cl_kernels/dequantization_layer.cl index 7d87dc6a2d..5826847a5e 100644 --- a/src/core/CL/cl_kernels/dequantization_layer.cl +++ b/src/core/CL/cl_kernels/dequantization_layer.cl @@ -87,5 +87,137 @@ __kernel void dequantization_layer( *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - (int)(OFFSET)) * (float)(SCALE)); #endif // defined(LAST_ACCESSED_X) } - #endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) && defined(SCALE) && defined(OFFSET) + +#if defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) +/** This performs per channel dequantization of 8-bit unsigned integers to floating point. (NCHW) + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8_PER_CHANNEL + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale Pointer to buffer with the per channel quantized scales + * @param[in] offset Pointer to buffer with the per channel quantized offsets + */ +__kernel void dequantization_layer_per_channel_nchw( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + __global float *scale, + __global int *offset) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale and offset vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = scale[get_global_id(2)]; + + const VEC_DATA_TYPE(int, VEC_SIZE) + voffset = offset[get_global_id(2)]; + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - offset[get_global_id(2)]) * scale[get_global_id(2)]); +#endif // defined(LAST_ACCESSED_X) +} +/** This performs per channel dequantization of 8-bit unsigned integers to floating point. (NHWC) + * + * @note Source datatype should be given as a preprocessor argument using -DDATA_TYPE_SRC=type. e.g. -DDATA_TYPE_SRC=char + * @note Destination datatype should be given as a preprocessor argument using -DDATA_TYPE_DST=type. e.g. -DDATA_TYPE_DST=float + * @note Vector size should be given as a preprocessor argument using -DVEC_SIZE=size. e.g. -DVEC_SIZE=16 + * + * @param[in] input_ptr Pointer to the source tensor. Supported data types: QASYMM8_PER_CHANNEL + * @param[in] input_stride_x Stride of the source tensor in X dimension (in bytes) + * @param[in] input_step_x input_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] input_stride_y Stride of the source tensor in Y dimension (in bytes) + * @param[in] input_step_y input_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] input_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] input_step_z input_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] input_offset_first_element_in_bytes The offset of the first element in the source tensor + * @param[out] output_ptr Pointer to the destination tensor. Supported data types: F16/F32 + * @param[in] output_stride_x Stride of the destination tensor in X dimension (in bytes) + * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) + * @param[in] output_stride_y Stride of the destination tensor in Y dimension (in bytes) + * @param[in] output_step_y output_stride_y * number of elements along Y processed per workitem(in bytes) + * @param[in] output_stride_z Stride of the source tensor in Z dimension (in bytes) + * @param[in] output_step_z output_stride_z * number of elements along Z processed per workitem(in bytes) + * @param[in] output_offset_first_element_in_bytes The offset of the first element in the destination tensor + * @param[in] scale Pointer to buffer with the per channel quantized scales + * @param[in] offset Pointer to buffer with the per channel quantized offsets + */ +__kernel void dequantization_layer_per_channel_nhwc( + TENSOR3D_DECLARATION(input), + TENSOR3D_DECLARATION(output), + __global float *scale, + __global int *offset) +{ + // Get pixels pointer + Tensor3D input = CONVERT_TO_TENSOR3D_STRUCT(input); + Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); + +#if defined(LAST_ACCESSED_X) + // Check if access on width gets out of bounds + // If it does shift access vector to access elements within bounds + const int xi = (int)(get_global_id(0) * VEC_SIZE); + input.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * input_stride_x; + output.ptr -= max(xi - (int)LAST_ACCESSED_X, 0) * output_stride_x; + scale -= max(xi - (int)LAST_ACCESSED_X, 0); + offset -= max(xi - (int)LAST_ACCESSED_X, 0); + + // Load data + VEC_DATA_TYPE(int, VEC_SIZE) + val = CONVERT(VLOAD(VEC_SIZE)(0, (__global DATA_TYPE_SRC *)input.ptr), VEC_DATA_TYPE(int, VEC_SIZE)); + + // Create scale and offset vectors + const VEC_DATA_TYPE(float, VEC_SIZE) + vscale = VLOAD(VEC_SIZE)(0, &scale[xi]); + + const VEC_DATA_TYPE(int, VEC_SIZE) + voffset = VLOAD(VEC_SIZE)(0, &offset[xi]); + + // Dequantize + VEC_DATA_TYPE(float, VEC_SIZE) + res = vscale * CONVERT((val - voffset), VEC_DATA_TYPE(float, VEC_SIZE)); + + // Store result + VSTORE(VEC_SIZE) + (CONVERT(res, VEC_DATA_TYPE(DATA_TYPE_DST, VEC_SIZE)), 0, (__global DATA_TYPE_DST *)output.ptr); +#else // !defined(LAST_ACCESSED_X) + *((__global DATA_TYPE_DST *)(output.ptr)) = (DATA_TYPE_DST)((float)((int)(*((__global DATA_TYPE_SRC *)(input.ptr))) - offset[get_global_id(0)]) * scale[get_global_id(0)]); +#endif // defined(LAST_ACCESSED_X) +} +#endif // defined(VEC_SIZE) && defined(DATA_TYPE_SRC) && defined(DATA_TYPE_DST) diff --git a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp index 10a2878be7..3ec0b87636 100644 --- a/src/core/CL/kernels/CLDequantizationLayerKernel.cpp +++ b/src/core/CL/kernels/CLDequantizationLayerKernel.cpp @@ -40,7 +40,7 @@ namespace Status validate_arguments(const ITensorInfo *input, const ITensorInfo *output) { ARM_COMPUTE_RETURN_ERROR_ON_NULLPTR(input, output); - ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QSYMM8, DataType::QSYMM16); + ARM_COMPUTE_RETURN_ERROR_ON_DATA_TYPE_CHANNEL_NOT_IN(input, 1, DataType::QASYMM8, DataType::QASYMM8_PER_CHANNEL, DataType::QSYMM8, DataType::QSYMM16); if(output->tensor_shape().total_size() > 0) { @@ -95,20 +95,31 @@ void CLDequantizationLayerKernel::configure(const ICLTensor *input, ICLTensor *o } ICLKernel::configure_internal(win); - const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); - const int qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0; + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(input->info()->data_type()); + std::string kernel_name = "dequantization_layer"; // Create kernel CLBuildOptions build_opts; - build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); - build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset)); + if(!is_quantized_per_channel) + { + const UniformQuantizationInfo qinfo = input->info()->quantization_info().uniform(); + const int qoffset = is_data_type_quantized_asymmetric(input->info()->data_type()) ? qinfo.offset : 0; + build_opts.add_option("-DSCALE=" + float_to_string_with_full_precision(qinfo.scale)); + build_opts.add_option("-DOFFSET=" + support::cpp11::to_string(qoffset)); + } + else + { + kernel_name += "_per_channel"; + kernel_name += input->info()->data_layout() == DataLayout::NCHW ? "_nchw" : "_nhwc"; + } + build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(vec_size_x)); build_opts.add_option("-DDATA_TYPE_SRC=" + get_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DDATA_TYPE_DST=" + get_cl_type_from_data_type(output->info()->data_type())); build_opts.add_option_if(multi_access_x, "-DLAST_ACCESSED_X=" + support::cpp11::to_string(std::max<int>(output_width_x - vec_size_x, 0))); // Create kernel name - _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("dequantization_layer", build_opts.options())); + _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel(kernel_name, build_opts.options())); } Status CLDequantizationLayerKernel::validate(const ITensorInfo *input, const ITensorInfo *output) @@ -123,8 +134,18 @@ void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &qu ARM_COMPUTE_ERROR_ON_UNCONFIGURED_KERNEL(this); ARM_COMPUTE_ERROR_ON_INVALID_SUBWINDOW(ICLKernel::window(), window); - Window window_collapsed = window.collapse_if_possible(ICLKernel::window(), 3); - Window slice = window_collapsed.first_slice_window_3D(); + const bool is_quantized_per_channel = is_data_type_quantized_per_channel(_input->info()->data_type()); + + // Collapse windo + Window new_window = is_quantized_per_channel ? window.collapse_if_possible(ICLKernel::window(), 4) : window.collapse_if_possible(ICLKernel::window(), 3); + Window slice = new_window.first_slice_window_3D(); + + if(is_quantized_per_channel) + { + unsigned int idx = num_arguments_per_3D_tensor() * 2; //Skip the input and output parameters + _kernel.setArg(idx++, _input->quantization().scale->cl_buffer()); + _kernel.setArg(idx++, _input->quantization().offset->cl_buffer()); + } do { @@ -133,6 +154,6 @@ void CLDequantizationLayerKernel::run(const Window &window, cl::CommandQueue &qu add_3D_tensor_argument(idx, _output, slice); enqueue(queue, *this, slice, lws_hint()); } - while(window_collapsed.slide_window_slice_3D(slice)); + while(new_window.slide_window_slice_3D(slice)); } } // namespace arm_compute
\ No newline at end of file |