diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/concatenate.cl | 59 |
1 files changed, 52 insertions, 7 deletions
diff --git a/src/core/CL/cl_kernels/concatenate.cl b/src/core/CL/cl_kernels/concatenate.cl index dc381803e6..c374769423 100644 --- a/src/core/CL/cl_kernels/concatenate.cl +++ b/src/core/CL/cl_kernels/concatenate.cl @@ -23,8 +23,22 @@ */ #include "helpers.h" -#if defined(DATA_TYPE) && defined(VEC_SIZE) +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) +#define VEC_FLOAT VEC_DATA_TYPE(float, VEC_SIZE) +#define VEC_INT VEC_DATA_TYPE(int, VEC_SIZE) +#define VEC_UCHAR VEC_DATA_TYPE(uchar, VEC_SIZE) +#define CONVERT_RTE(x, type) (convert_##type##_rte((x))) +#define CONVERT_DOWN(x, type) CONVERT_RTE(x, type) +inline VEC_UCHAR requantize(VEC_UCHAR input, float in_offset, float out_offset, float in_scale, float out_scale) +{ + const VEC_FLOAT in_f32 = (CONVERT(input, VEC_FLOAT) - (VEC_FLOAT)((float)in_offset)) * (VEC_FLOAT)((float)in_scale); + const VEC_FLOAT out_f32 = in_f32 / ((VEC_FLOAT)(float)out_scale) + ((VEC_FLOAT)((float)out_offset)); + const VEC_UCHAR res_u8 = CONVERT_SAT(CONVERT_DOWN(out_f32, VEC_INT), VEC_UCHAR); + return res_u8; +} +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ +#if defined(DATA_TYPE) && defined(VEC_SIZE) #if defined(DEPTH) && defined(ELEMENT_SIZE) #if defined(INPUT1_WIDTH) @@ -50,6 +64,7 @@ #else // VEC_SIZE #error "Vector size not supported" #endif // VEC_SIZE + /** This kernel concatenates two input tensors into the output tensor along the first dimension * * @note The data type has to be passed at compile time using -DDATA_TYPE. i.e. -DDATA_TYPE=float @@ -112,9 +127,15 @@ __kernel void concatenate_width_x2( const __global uchar *in1_ptr = src1_ptr + (int)src1_offset_first_element_in_bytes + x1 * (int)src1_stride_x + y * (int)src1_stride_y + z * (int)src1_stride_z + w * (int)src1_stride_w; const __global uchar *in2_ptr = src2_ptr + (int)src2_offset_first_element_in_bytes + x2 * (int)src2_stride_x + y * (int)src2_stride_y + z * (int)src2_stride_z + w * (int)src2_stride_w; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) + src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) */ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x); const VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE) cond = CONVERT(x_coords < (VEC_DATA_TYPE(int, VEC_SIZE))(INPUT1_WIDTH), VEC_DATA_TYPE(COND_DATA_TYPE, VEC_SIZE)); const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) values = select(src2_values, src1_values, cond); @@ -223,10 +244,21 @@ __kernel void concatenate_width_x4( const __global uchar *in3_ptr = src3_ptr + (int)src3_offset_first_element_in_bytes + x3 * (int)src3_stride_x + y * (int)src3_stride_y + z * (int)src3_stride_z + w * (int)src3_stride_w; const __global uchar *in4_ptr = src4_ptr + (int)src4_offset_first_element_in_bytes + x4 * (int)src4_stride_x + y * (int)src4_stride_y + z * (int)src4_stride_z + w * (int)src4_stride_w; - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr); - const VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src1_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in1_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src2_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in2_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src3_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in3_ptr); + VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) + src4_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)in4_ptr); + +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) + src1_values = requantize(src1_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + src2_values = requantize(src2_values, OFFSET_IN2, OFFSET_OUT, SCALE_IN2, SCALE_OUT); + src3_values = requantize(src3_values, OFFSET_IN3, OFFSET_OUT, SCALE_IN3, SCALE_OUT); + src4_values = requantize(src4_values, OFFSET_IN4, OFFSET_OUT, SCALE_IN4, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) && defined(OFFSET_IN2) && defined(SCALE_IN2) && defined(OFFSET_IN3) && defined(SCALE_IN3) && defined(OFFSET_IN4) && defined(SCALE_IN4) */ const VEC_DATA_TYPE(int, VEC_SIZE) x_coords = SEQ + (VEC_DATA_TYPE(int, VEC_SIZE))(x); @@ -275,6 +307,7 @@ __kernel void concatenate_width_x4( * @param[in] dst_step_w output_stride_z * number of elements along Z processed per workitem(in bytes) * @param[in] dst_offset_first_element_in_bytes The offset of the first element in the destination tensor */ + __kernel void concatenate_width( TENSOR4D_DECLARATION(src), TENSOR4D_DECLARATION(dst)) @@ -285,9 +318,16 @@ __kernel void concatenate_width( VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)src.ptr); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + const VEC_UCHAR out = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); + VSTORE(VEC_SIZE) + (out, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET); +#else /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ VSTORE(VEC_SIZE) (source_values, 0, (__global DATA_TYPE *)(dst.ptr) + WIDTH_OFFSET); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ } + #endif /* defined(WIDTH_OFFSET) && defined(DEPTH) */ /** This kernel concatenates the input tensor into the output tensor along the third dimension @@ -324,7 +364,12 @@ __kernel void concatenate_depth( VEC_DATA_TYPE(DATA_TYPE, VEC_SIZE) source_values = VLOAD(VEC_SIZE)(0, (__global DATA_TYPE *)tensor3D_offset(&src, -offsets.x, -offsets.y, 0)); +#if defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) + source_values = requantize(source_values, OFFSET_IN1, OFFSET_OUT, SCALE_IN1, SCALE_OUT); +#endif /* defined(OFFSET_IN1) && defined(OFFSET_OUT) && defined(SCALE_IN1) && defined(SCALE_OUT) */ + VSTORE(VEC_SIZE) (source_values, 0, (__global DATA_TYPE *)(dst.ptr + offsets.z)); + } #endif /* defined(DATA_TYPE) && defined(VEC_SIZE) */ |