diff options
Diffstat (limited to 'src/core')
5 files changed, 92 insertions, 9 deletions
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl index dc381803e6..c374769423 100644 --- a/src/core/CL/cl_kernels/concatenate.cl +++ b/src/core/CL/cl_kernels/concatenate.cl @@ -23,8 +23,22 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(VEC_SIZE) +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +inline VEC_UCHAR requantize(VEC_UCHAR input, float in_offset, float out_offset, float in_scale, float out_scale) +{ + const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale); + const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset)); + const VEC_UCHAR res_u8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_UCHAR); + return res_u8; +} +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ +#if defined(DATA_TYPE) && defined(VEC_SIZE) #if defined(DEPTH) && defined(ELEMENT_SIZE) #if defined(INPUT1_WIDTH) @@ -50,6 +64,7 @@ #else // VEC_SIZE #error "Vector size not supported" #endif // VEC_SIZE + /** This kernel concatenates two input tensors into the output tensor along the first dimension * * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float @@ -112,9 +127,15 @@ __kernel void concatenate_width_x2( const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) + src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) */ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x); const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE)); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond); @@ -223,10 +244,21 @@ __kernel void concatenate_width_x4( const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w; const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) + src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); + src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT); + src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x); @@ -275,6 +307,7 @@ __kernel void concatenate_width_x4( * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ + __kernel void concatenate_width( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst)) @@ -285,9 +318,16 @@ __kernel void concatenate_width( VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + const VEC_UCHAR out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + VSTORE(VEC_SIZE) + (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET); +#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ VSTORE(VEC_SIZE) (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ } + #endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */ /** This kernel concatenates the input tensor into the output tensor along the third dimension @@ -324,7 +364,12 @@ __kernel void concatenate_depth( VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0)); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + VSTORE(VEC_SIZE) (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z)); + } #endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */ diff --git a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp index 40023948b1..3fccc0447d 100644 --- a/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLDepthConcatenateLayerKernel.cpp @@ -113,6 +113,13 @@ void CLDepthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned i CLBuildOptions build_opts; build_opts.add_option("-DDATA_TYPE=" + get_underlying_cl_type_from_data_type(input->info()->data_type())); build_opts.add_option("-DVEC_SIZE=" + support::cpp11::to_string(num_elems_processed_per_iteration)); + if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + { + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale)); + } // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_depth", build_opts.options())); diff --git a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp index 79bc9a5674..d58cef57de 100644 --- a/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate2TensorsKernel.cpp @@ -111,6 +111,16 @@ void CLWidthConcatenate2TensorsKernel::configure(const ICLTensor *input1, const build_opts.add_option("-DINPUT1_WIDTH=" + support::cpp11::to_string(input1->info()->dimension(0))); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size())); + if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info()) + { + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale)); + build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale)); + } + // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x2", build_opts.options())); diff --git a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp index 2db59df7f2..9cbb7130b7 100644 --- a/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenate4TensorsKernel.cpp @@ -133,6 +133,20 @@ void CLWidthConcatenate4TensorsKernel::configure(const ICLTensor *input1, const build_opts.add_option("-DINPUT3_WIDTH=" + support::cpp11::to_string(input3->info()->dimension(0))); build_opts.add_option("-DELEMENT_SIZE=" + support::cpp11::to_string(input1->info()->element_size())); + if(is_data_type_quantized_asymmetric(input1->info()->data_type()) && input1->info()->quantization_info() != output->info()->quantization_info()) + { + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input1->info()->quantization_info().scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale)); + build_opts.add_option("-DOFFSET_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN2=" + float_to_string_with_full_precision(input2->info()->quantization_info().scale)); + build_opts.add_option("-DOFFSET_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN3=" + float_to_string_with_full_precision(input3->info()->quantization_info().scale)); + build_opts.add_option("-DOFFSET_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN4=" + float_to_string_with_full_precision(input4->info()->quantization_info().scale)); + } + // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width_x4", build_opts.options())); diff --git a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp index c51c5796d1..6c32cd2371 100644 --- a/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp +++ b/src/core/CL/kernels/CLWidthConcatenateLayerKernel.cpp @@ -1,5 +1,5 @@ /* - * Copyright (c) 2018 ARM Limited. + * Copyright (c) 2018-2019 ARM Limited. * * SPDX-License-Identifier: MIT * @@ -107,9 +107,16 @@ void CLWidthConcatenateLayerKernel::configure(const ICLTensor *input, unsigned i build_opts.add_option("-DWIDTH_OFFSET=" + support::cpp11::to_string(_width_offset)); build_opts.add_option("-DDEPTH=" + support::cpp11::to_string(input->info()->dimension(2))); + if(is_data_type_quantized_asymmetric(input->info()->data_type()) && input->info()->quantization_info() != output->info()->quantization_info()) + { + build_opts.add_option("-DOFFSET_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().offset)); + build_opts.add_option("-DOFFSET_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().offset)); + build_opts.add_option("-DSCALE_IN1=" + float_to_string_with_full_precision(input->info()->quantization_info().scale)); + build_opts.add_option("-DSCALE_OUT=" + float_to_string_with_full_precision(output->info()->quantization_info().scale)); + } + // Create kernel _kernel = static_cast<cl::Kernel>(CLKernelLibrary::get().create_kernel("concatenate_width", build_opts.options())); - // Configure kernel window auto win_config = validate_and_configure_window(input->info(), width_offset, output->info()); ARM_COMPUTE_ERROR_THROW_ON(std::get<0>(win_config)); |