diff options
Diffstat (limited to 'src/core/CL/cl_kernels')
-rw-r--r-- | src/core/CL/cl_kernels/reduction_operation.cl | 144 |
1 files changed, 114 insertions, 30 deletions
diff --git a/src/core/CL/cl_kernels/reduction_operation.cl b/src/core/CL/cl_kernels/reduction_operation.cl index d76e12ac04..d1f47beda7 100644 --- a/src/core/CL/cl_kernels/reduction_operation.cl +++ b/src/core/CL/cl_kernels/reduction_operation.cl @@ -60,7 +60,7 @@ inline DATA_TYPE sum(__global const DATA_TYPE *input) return (in.s0 + in.s1); } - +#if defined(OPERATION) /** This kernel performs parallel reduction given an operation on x-axis. * * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float @@ -120,13 +120,16 @@ __kernel void reduction_operation_x( } } } +#endif // defined(OPERATION) #if defined(WIDTH) -/** This kernel performs reduction on x-axis. (QASYMM8) +/** This kernel performs reduction on x-axis. (Non parallel) * + * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The width size must be passed at compile time using -DWIDTH e.g. -DWIDTH=128 + * @note In case of ARG_MIN and ARG_MAX the condition data type must be passed at compile time using -DCOND_DATA_TYPE e.g. -DCOND_DATA_TYPE=short * - * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8 + * @param[in] src_ptr Pointer to the source tensor. Supported data types: F16/F32 and QASYMM8 for operation MEAN * @param[in] src_stride_x Stride of the source tensor in X dimension (in bytes) * @param[in] src_step_x src_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] src_offset_first_element_in_bytes The offset of the first element in the source tensor @@ -135,33 +138,49 @@ __kernel void reduction_operation_x( * @param[in] output_step_x output_stride_x * number of elements along X processed per workitem(in bytes) * @param[in] output_offset_first_element_in_bytes The offset of the first element in the source tensor */ -__kernel void reduction_operation_quantized_x( +__kernel void reduction_operation_non_parallel_x( VECTOR_DECLARATION(src), VECTOR_DECLARATION(output)) { Vector src = CONVERT_TO_VECTOR_STRUCT(src); Vector output = CONVERT_TO_VECTOR_STRUCT(output); - uint res = 0; + DATA_TYPE_PROMOTED res = *((__global DATA_TYPE *)vector_offset(&src, 0)); - for(unsigned int x = 0; x < WIDTH; ++x) +#if defined(ARG_MAX) || defined(ARG_MIN) + uint indx = 0; +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + for(unsigned int x = 1; x < WIDTH; ++x) { - res += *((__global uchar *)vector_offset(&src, x)); + DATA_TYPE_PROMOTED in = *((__global DATA_TYPE *)vector_offset(&src, x)); +#if defined(ARG_MAX) + indx = select(indx, x, isgreater(in, res)); + res = select(res, in, CONVERT(isgreater(in, res), COND_DATA_TYPE)); +#elif defined(ARG_MIN) + indx = select(indx, x, isless(in, res)); + res = select(res, in, CONVERT(isless(in, res), COND_DATA_TYPE)); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) + res += in; +#endif // defined(ARG_MAX) || defined(ARG_MIN) } + // Store result +#if defined(ARG_MAX) || defined(ARG_MIN) + *((__global uint *)output.ptr) = indx; +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(MEAN) res /= WIDTH; -#endif /* defined(MEAN) */ - - // Store result +#endif // defined(MEAN) *((__global uchar *)output.ptr) = convert_uchar(res); +#endif // defined(ARG_MAX) || defined(ARG_MIN) } -#endif /* defined(HEIGHT) */ +#endif /* defined(WIDTH) */ #if defined(HEIGHT) /** This kernel performs reduction on y-axis. * - * @note The data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float + * @note The input data type must be passed at compile time using -DDATA_TYPE: e.g. -DDATA_TYPE=float * @note The height size must be passed at compile time using -DHEIGHT e.g. -DHEIGHT=128 * * @param[in] src_ptr Pointer to the source tensor. Supported data types: QASYMM8/F16/F32 @@ -185,24 +204,45 @@ __kernel void reduction_operation_y( Image output = CONVERT_TO_IMAGE_STRUCT(output); VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) - res = 0; + res = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + +#if defined(SUM_SQUARE) + res *= res; +#endif // defined(SUM_SQUARE) + +#if defined(ARG_MAX) || defined(ARG_MIN) + uint16 indx = 0; +#endif // defined(ARG_MAX) || defined(ARG_MIN) - for(unsigned int y = 0; y < HEIGHT; ++y) + for(unsigned int y = 1; y < HEIGHT; ++y) { VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) in = CONVERT(vload16(0, (__global DATA_TYPE *)offset(&src, 0, y)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); +#if defined(ARG_MAX) + uint16 cond_conv = CONVERT(isgreater(in, res), uint16); + indx = select(indx, y, cond_conv); + res = select(res, in, isgreater(in, res)); +#elif defined(ARG_MIN) + uint16 cond_conv = CONVERT(isless(in, res), uint16); + indx = select(indx, y, cond_conv); + res = select(res, in, isless(in, res)); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(SUM_SQUARE) in *= in; -#endif // SQRSUM +#endif // defined(SUM_SQUARE) res += in; +#endif // defined(ARG_MAX) || defined(ARG_MIN) } + // Store result +#if defined(ARG_MAX) || defined(ARG_MIN) + vstore16(indx, 0, (__global uint *)output.ptr); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(MEAN) res /= HEIGHT; -#endif /* defined(MEAN) */ - - // Store result +#endif // defined(MEAN) vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); +#endif // defined(ARG_MAX) || defined(ARG_MIN) } #endif /* defined(HEIGHT) */ @@ -237,24 +277,46 @@ __kernel void reduction_operation_z( Tensor3D output = CONVERT_TO_TENSOR3D_STRUCT(output); VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) - res = 0; + res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); - for(unsigned int z = 0; z < DEPTH; ++z) +#if defined(SUM_SQUARE) + res *= res; +#endif // defined(SUM_SQUARE) + +#if defined(ARG_MAX) || defined(ARG_MIN) + uint16 indx = 0; +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + for(unsigned int z = 1; z < DEPTH; ++z) { VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor3D_offset(&input, 0, 0, z)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + +#if defined(ARG_MAX) + uint16 cond_conv = CONVERT(isgreater(in, res), uint16); + indx = select(indx, z, cond_conv); + res = select(res, in, isgreater(in, res)); +#elif defined(ARG_MIN) + uint16 cond_conv = CONVERT(isless(in, res), uint16); + indx = select(indx, z, cond_conv); + res = select(res, in, isless(in, res)); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(SUM_SQUARE) in *= in; -#endif // SQRSUM +#endif // defined(SUM_SQUARE) res += in; +#endif // defined(ARG_MAX) || defined(ARG_MIN) } + // Store result +#if defined(ARG_MAX) || defined(ARG_MIN) + vstore16(indx, 0, (__global uint *)output.ptr); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(MEAN) res /= DEPTH; -#endif /* defined(MEAN) */ - - // Store result +#endif // defined(MEAN) vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); +#endif // defined(ARG_MAX) || defined(ARG_MIN) } #endif /* defined(DEPTH) */ @@ -294,23 +356,45 @@ __kernel void reduction_operation_w( Tensor4D output = CONVERT_TO_TENSOR4D_STRUCT(output, DEPTH); VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) - res = 0; + res = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, 0)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); - for(unsigned int w = 0; w < BATCH; ++w) +#if defined(SUM_SQUARE) + res *= res; +#endif // defined(SUM_SQUARE) + +#if defined(ARG_MAX) || defined(ARG_MIN) + uint16 indx = 0; +#endif // defined(ARG_MAX) || defined(ARG_MIN) + + for(unsigned int w = 1; w < BATCH; ++w) { VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16) in = CONVERT(vload16(0, (__global DATA_TYPE *)tensor4D_offset(&input, 0, 0, 0, w)), VEC_DATA_TYPE(DATA_TYPE_PROMOTED, 16)); + +#if defined(ARG_MAX) + uint16 cond_conv = CONVERT(isgreater(in, res), uint16); + indx = select(indx, w, cond_conv); + res = select(res, in, isgreater(in, res)); +#elif defined(ARG_MIN) + uint16 cond_conv = CONVERT(isless(in, res), uint16); + indx = select(indx, w, cond_conv); + res = select(res, in, isless(in, res)); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(SUM_SQUARE) in *= in; -#endif // SQRSUM +#endif // defined(SUM_SQUARE) res += in; +#endif // defined(ARG_MAX) || defined(ARG_MIN) } + // Store result +#if defined(ARG_MAX) || defined(ARG_MIN) + vstore16(indx, 0, (__global uint *)output.ptr); +#else // !(defined(ARG_MAX) || defined(ARG_MIN)) #if defined(MEAN) res /= BATCH; -#endif /* defined(MEAN) */ - - // Store result +#endif // defined(MEAN) vstore16(CONVERT(res, VEC_DATA_TYPE(DATA_TYPE, 16)), 0, (__global DATA_TYPE *)output.ptr); +#endif // defined(ARG_MAX) || defined(ARG_MIN) } #endif /* defined(BATCH) && defined(DEPTH) */
\ No newline at end of file |